In [128]:
import numpy as np
import pandas as pd
import hashlib

In [129]:
# Load the four raw exports. Every id column that is hashed later is read as
# text: hash_id() expects strings, and letting pandas infer the dtype could
# yield integers (or mixed types) for a column in some files.
# Note: the raw enrollments file is spelled 'enrolments.csv' on disk.
courses = pd.read_csv('raw_data/courses.csv', dtype={'course_id': str})
reviews = pd.read_csv('raw_data/reviews.csv', dtype={'course_id': str, 'user_id': str})
leads = pd.read_csv('raw_data/leads.csv', dtype={'course_id': str, 'user_id': str})
enrollments = pd.read_csv('raw_data/enrolments.csv', dtype={'course_id': str, 'user_id': str})

In [130]:
def hash_id(id):
    """Return the hex MD5 digest of an identifier, or NaN when it is missing.

    Parameters
    ----------
    id : str, int, float or None
        Raw identifier. Strings are hashed as-is (UTF-8 encoded). Integers
        hash like their decimal string, and integral floats such as ``35.0``
        hash like ``"35"``, so an id column that pandas inferred as a numeric
        dtype produces the same digests as the same column read as text.
        (The parameter name shadows the ``id`` builtin; it is kept so that
        existing keyword callers keep working.)

    Returns
    -------
    str or float
        32-character lowercase hex digest, or ``np.nan`` if ``pd.isnull(id)``.

    NOTE(review): this is an unsalted MD5. Short numeric ids can be recovered
    by hashing every candidate id, so treat the result as pseudonymised rather
    than anonymised; consider a keyed hash (``hmac``) if that matters.
    """
    if pd.isnull(id):
        return np.nan

    # An int column containing NaN is loaded as float; strip the trailing
    # ".0" so 35.0 and 35 map to the same digest.
    if isinstance(id, float) and id.is_integer():
        id = int(id)

    # str() is a no-op for strings and makes non-string ids hashable
    # instead of failing on the missing .encode() method.
    return hashlib.md5(str(id).encode()).hexdigest()

### Reviews: hash course ids and user ids

In [131]:
# Inspect the reviews schema: column dtypes and non-null counts.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 5 columns):
user_id            40000 non-null object
course_id          40000 non-null object
rating             40000 non-null int64
would_recommend    40000 non-null int64
created_on         40000 non-null object
dtypes: int64(2), object(3)
memory usage: 1.5+ MB


In [132]:
# Preview the first rows before hashing.
# NOTE(review): the rendered output of this cell shows raw, un-hashed ids;
# clear it before sharing the notebook.
reviews.head()

Unnamed: 0,user_id,course_id,rating,would_recommend,created_on
0,99999992768213172242341862848683,3468302,10,1,2019-11-28 17:36:56.0
1,99999992768213172242341864619086,3546634,8,1,2019-11-27 23:20:28.0
2,99999992768213172242341862276225,3463038,10,1,2019-11-27 12:48:58.0
3,99999992768213172242341864879224,3005366,8,1,2019-11-24 15:09:45.0
4,99999992768213172242341864788114,3598509,10,1,2019-11-24 13:50:17.0


In [133]:
# Baseline distinct counts before hashing: (user ids, course ids).
tuple(reviews[id_col].nunique() for id_col in ('user_id', 'course_id'))

(32609, 16173)

In [134]:
# Replace each raw id with its hash_id() digest, element by element.
# Not idempotent: re-running this cell hashes the digests a second time.
reviews['user_id'] = reviews['user_id'].map(hash_id)
reviews['course_id'] = reviews['course_id'].map(hash_id)

In [135]:
# Distinct counts after hashing: (user ids, course ids). They should match
# the pre-hash counts, i.e. no two ids collapsed into the same digest.
tuple(reviews[id_col].nunique() for id_col in ('user_id', 'course_id'))

(32609, 16173)

In [136]:
# Preview the first rows again to confirm both id columns now hold digests.
reviews.head()

Unnamed: 0,user_id,course_id,rating,would_recommend,created_on
0,51032d220ab2279b636f1b6281b4ae58,767e2c2694eba8846b43907ce13339d6,10,1,2019-11-28 17:36:56.0
1,28c2598b9595df51264eb811492d9394,31ad7e4bb73e736dcc219b65ad70f9f2,8,1,2019-11-27 23:20:28.0
2,559ead19ade9b2075cc673806be024d3,0dc630bd6b91c59bd642268c55d5d369,10,1,2019-11-27 12:48:58.0
3,13043f6b28ef0eb89d0de5fd37840006,f1773d38b79747ac29664cbf6ba78242,8,1,2019-11-24 15:09:45.0
4,5da6b9b0733df5317edab20016605642,96583309b833d38b83fadfbf994f0e1c,10,1,2019-11-24 13:50:17.0


In [137]:
# Write the hashed reviews to data/; index=False leaves the row index out of the file.
reviews.to_csv('data/reviews.csv', index=False)

### Courses: hash course ids

In [138]:
# Inspect the courses schema: column dtypes and non-null counts.
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41899 entries, 0 to 41898
Data columns (total 15 columns):
course_id      41899 non-null object
center_id      41899 non-null object
url            41899 non-null object
title          41899 non-null object
description    41649 non-null object
syllabus       41895 non-null object
methodology    41899 non-null object
category       41899 non-null object
family         41897 non-null object
course_type    41899 non-null object
price_range    34688 non-null object
duration       15339 non-null object
language       41899 non-null object
region         13424 non-null float64
updated_on     41899 non-null object
dtypes: float64(1), object(14)
memory usage: 4.8+ MB


In [139]:
# Preview the first rows before hashing.
# NOTE(review): the rendered output of this cell shows raw course and center
# ids; clear it before sharing the notebook.
courses.head()

Unnamed: 0,course_id,center_id,url,title,description,syllabus,methodology,category,family,course_type,price_range,duration,language,region,updated_on
0,3328746,99997192958079401036578245533930,/curso-calculo-construccion-pilotes-zonas-sism...,Curso de cálculo y construcción de pilotes en ...,Se trata de un curso de alto nivel para el con...,<p>El curso se encuadrar&aacute; en siete de U...,Online,Ingeniería civil,Inmobiliaria y construcción,Curso,151-300 €,Año(s),Castellano,,2019-12-02 11:50:33.0
1,3528894,99997192958079401036578245533930,/curso-diseno-calculo-estructuras-acero-metodo...,Curso de Diseño y cálculo de estructuras de ac...,Dadas las nuevas tendencias y requerimientos B...,<p>TEMA 01 | INTRODUCCI&Oacute;N AL DISE&Ntild...,Online,Ingeniería civil,Inmobiliaria y construcción,Curso,151-300 €,Año(s),Castellano,,2019-12-02 11:50:12.0
2,2780144,40181100082250516849485256694567,/curso-para-profundizar-norma-ifs-cursos-27801...,Norma Ifs - Food V.6.1 (online),¿Sabes cómo gestionar la seguridad alimentaria...,<p><strong>M&oacute;dulo 1.&nbsp; Fundamentos ...,Online,Calidad alimentaria,Industria,Curso,151-300 €,,Castellano,,2019-12-02 11:47:44.0
3,2678879,51859010040551486953485250494565,/gestion-del-modelo-financiacion-formacion-con...,Gestión del Modelo de Financiación de la Forma...,¿Te gustaría conocer hasta el último detalle s...,<p><strong>DIA 1- Nivel intermedio</strong><br...,Presencial,Formación de personal,Administración de empresas,Curso subvencionado para trabajadores,501-1000 €,Mes(es),Castellano,35.0,2019-12-02 11:46:14.0
4,2466756,51859010040551486953485250494565,/gestion-del-modelo-financiacion-formacion-con...,Gestión del Modelo de Financiación de la Forma...,Si quieres aprender a gestionar la formación c...,<p>&bull; Alternativas de Gesti&oacute;n:</p>\...,Presencial,Formación de personal,Administración de empresas,Curso subvencionado para trabajadores,301-500 €,Día(s),Castellano,35.0,2019-12-02 11:45:07.0


In [140]:
# Baseline number of distinct course ids before hashing.
courses['course_id'].nunique()

41899

In [141]:
# Replace each raw course id with its hash_id() digest, element by element.
# Not idempotent: re-running this cell hashes the digests a second time.
courses['course_id'] = courses['course_id'].map(hash_id)

In [142]:
# Distinct count after hashing; it should equal the pre-hash count
# (no two course ids collapsed into the same digest).
courses['course_id'].nunique()

41899

In [143]:
# Re-check the schema after hashing: same columns, dtypes and non-null counts expected.
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41899 entries, 0 to 41898
Data columns (total 15 columns):
course_id      41899 non-null object
center_id      41899 non-null object
url            41899 non-null object
title          41899 non-null object
description    41649 non-null object
syllabus       41895 non-null object
methodology    41899 non-null object
category       41899 non-null object
family         41897 non-null object
course_type    41899 non-null object
price_range    34688 non-null object
duration       15339 non-null object
language       41899 non-null object
region         13424 non-null float64
updated_on     41899 non-null object
dtypes: float64(1), object(14)
memory usage: 4.8+ MB


In [144]:
# Preview the rows after hashing course_id.
# At this point center_id and url are still present and un-hashed.
courses.head()

Unnamed: 0,course_id,center_id,url,title,description,syllabus,methodology,category,family,course_type,price_range,duration,language,region,updated_on
0,25d0482af56ef761d627104ff58241d2,99997192958079401036578245533930,/curso-calculo-construccion-pilotes-zonas-sism...,Curso de cálculo y construcción de pilotes en ...,Se trata de un curso de alto nivel para el con...,<p>El curso se encuadrar&aacute; en siete de U...,Online,Ingeniería civil,Inmobiliaria y construcción,Curso,151-300 €,Año(s),Castellano,,2019-12-02 11:50:33.0
1,3f2735cdb4da9a61b532a50d47790e26,99997192958079401036578245533930,/curso-diseno-calculo-estructuras-acero-metodo...,Curso de Diseño y cálculo de estructuras de ac...,Dadas las nuevas tendencias y requerimientos B...,<p>TEMA 01 | INTRODUCCI&Oacute;N AL DISE&Ntild...,Online,Ingeniería civil,Inmobiliaria y construcción,Curso,151-300 €,Año(s),Castellano,,2019-12-02 11:50:12.0
2,3cc4c8696dcd68dd606842d2323827e1,40181100082250516849485256694567,/curso-para-profundizar-norma-ifs-cursos-27801...,Norma Ifs - Food V.6.1 (online),¿Sabes cómo gestionar la seguridad alimentaria...,<p><strong>M&oacute;dulo 1.&nbsp; Fundamentos ...,Online,Calidad alimentaria,Industria,Curso,151-300 €,,Castellano,,2019-12-02 11:47:44.0
3,58cf421dd698d0ee689eb5d4208e7dac,51859010040551486953485250494565,/gestion-del-modelo-financiacion-formacion-con...,Gestión del Modelo de Financiación de la Forma...,¿Te gustaría conocer hasta el último detalle s...,<p><strong>DIA 1- Nivel intermedio</strong><br...,Presencial,Formación de personal,Administración de empresas,Curso subvencionado para trabajadores,501-1000 €,Mes(es),Castellano,35.0,2019-12-02 11:46:14.0
4,07eefa376a6f7bc75b8ad706f67c57af,51859010040551486953485250494565,/gestion-del-modelo-financiacion-formacion-con...,Gestión del Modelo de Financiación de la Forma...,Si quieres aprender a gestionar la formación c...,<p>&bull; Alternativas de Gesti&oacute;n:</p>\...,Presencial,Formación de personal,Administración de empresas,Curso subvencionado para trabajadores,301-500 €,Día(s),Castellano,35.0,2019-12-02 11:45:07.0


### Remove center info and url

In [145]:
# Remove the center identifier and the course URL so they are not written
# to the published file (the URL appears to embed the raw course id --
# TODO confirm against the full, untruncated values).
# Reassigning instead of inplace=True, together with errors='ignore',
# keeps this cell safe to re-run once the columns are already gone.
courses = courses.drop(columns=['center_id', 'url'], errors='ignore')

In [146]:
# Confirm that center_id and url are no longer among the columns.
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41899 entries, 0 to 41898
Data columns (total 13 columns):
course_id      41899 non-null object
title          41899 non-null object
description    41649 non-null object
syllabus       41895 non-null object
methodology    41899 non-null object
category       41899 non-null object
family         41897 non-null object
course_type    41899 non-null object
price_range    34688 non-null object
duration       15339 non-null object
language       41899 non-null object
region         13424 non-null float64
updated_on     41899 non-null object
dtypes: float64(1), object(12)
memory usage: 4.2+ MB


In [147]:
# Write the hashed, trimmed courses to data/; index=False leaves the row index out of the file.
courses.to_csv('data/courses.csv', index=False)

### Leads: hash course ids and user ids

In [148]:
# Baseline distinct counts before hashing: (user ids, course ids).
tuple(leads[id_col].nunique() for id_col in ('user_id', 'course_id'))

(24606, 22262)

In [149]:
# Replace each raw id with its hash_id() digest, element by element.
# Not idempotent: re-running this cell hashes the digests a second time.
leads['user_id'] = leads['user_id'].map(hash_id)
leads['course_id'] = leads['course_id'].map(hash_id)

In [150]:
# Distinct counts after hashing: (user ids, course ids). They should match
# the pre-hash counts, i.e. no two ids collapsed into the same digest.
tuple(leads[id_col].nunique() for id_col in ('user_id', 'course_id'))

(24606, 22262)

In [151]:
# Preview the first rows to confirm both id columns now hold digests.
leads.head()

Unnamed: 0,user_id,course_id
0,10fb910da113efc5cd74b81065400e12,e0c424c5fc399cdbb1bf8549bde633c5
1,f43f477510f8c2048e600a85ab339369,f8ddab8ef084cc588c984b58767c5cbb
2,c7011ab2cb1b9a24e59e5ba4ba217d64,3395fa729651491cd1a84502ed090327
3,8596ef53d30e7baf338fe6854bc0a443,4e0abe5d6fd932aea04bdbd79360bb4c
4,f88b4e58210523ee2351dcf7212781bc,12551267b905d157247a6618809a8e50


In [152]:
# Write the hashed leads to data/; index=False leaves the row index out of the file.
leads.to_csv('data/leads.csv', index=False)

### Enrollments: hash course ids and user ids

In [153]:
# Inspect the enrollments schema: column dtypes and non-null counts.
enrollments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36473 entries, 0 to 36472
Data columns (total 3 columns):
user_id       36473 non-null object
course_id     36473 non-null object
created_on    36473 non-null object
dtypes: object(3)
memory usage: 854.9+ KB


In [154]:
# Baseline distinct counts before hashing: (user ids, course ids).
tuple(enrollments[id_col].nunique() for id_col in ('user_id', 'course_id'))

(34069, 17707)

In [155]:
# Replace each raw id with its hash_id() digest, element by element.
# Not idempotent: re-running this cell hashes the digests a second time.
enrollments['user_id'] = enrollments['user_id'].map(hash_id)
enrollments['course_id'] = enrollments['course_id'].map(hash_id)

In [156]:
# Distinct counts after hashing: (user ids, course ids). They should match
# the pre-hash counts, i.e. no two ids collapsed into the same digest.
tuple(enrollments[id_col].nunique() for id_col in ('user_id', 'course_id'))

(34069, 17707)

In [157]:
# Write the hashed enrollments to data/; index=False leaves the row index out of the file.
# The output name is spelled 'enrollments', unlike the raw 'enrolments.csv' input.
enrollments.to_csv('data/enrollments.csv', index=False)

In [158]:
# Preview the first rows to confirm both id columns now hold digests.
enrollments.head()

Unnamed: 0,user_id,course_id,created_on
0,4e44de9990a37934ff841620eb23aa60,9986b296a935c8e12c5174002c553d28,2019-12-02 11:03:00.0
1,01894f391119850a080f75a9d5465e07,c50abb2999708df662f05bb5124becbe,2019-12-02 10:58:00.0
2,bb0129ee0be40917aab04e6a0d64d6fc,859d20d9ad7964bc61e1db2f89ac6d26,2019-12-02 10:57:00.0
3,cbbcb1a5990b8146a604ab348291b1e7,85881119076acb7f2db0eab38e00de54,2019-12-02 10:56:00.0
4,250c1e5e5bcbcfbb5aedd7eeca2fb892,51cdfed4901b031d70c01eb21db59b03,2019-12-02 10:45:00.0


In [162]:
# Schema of the trimmed courses frame again, as a reference for the
# methodology / region exploration in the following cells.
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41899 entries, 0 to 41898
Data columns (total 13 columns):
course_id      41899 non-null object
title          41899 non-null object
description    41649 non-null object
syllabus       41895 non-null object
methodology    41899 non-null object
category       41899 non-null object
family         41897 non-null object
course_type    41899 non-null object
price_range    34688 non-null object
duration       15339 non-null object
language       41899 non-null object
region         13424 non-null float64
updated_on     41899 non-null object
dtypes: float64(1), object(12)
memory usage: 4.2+ MB


In [173]:
# Number of courses per delivery methodology, most frequent first.
courses['methodology'].value_counts()

Online            19490
Presencial        10328
A distancia        8506
Semipresencial     3463
In company          112
Name: methodology, dtype: int64

In [174]:
# Total number of courses whose methodology is neither 'Online' nor
# 'A distancia'. Computed from the data instead of adding up numbers copied
# by hand from the value_counts() output, so it cannot go stale.
non_remote_methodologies = ['Presencial', 'In company', 'Semipresencial']
int(courses['methodology'].isin(non_remote_methodologies).sum())

13903

In [191]:
# How many 'Online' / 'A distancia' courses nevertheless have a region set?
online_courses = courses['methodology'] == 'Online'
distance_courses = courses['methodology'] == 'A distancia'
# True where a region IS present. The mask used to hold the isnull() result
# under this name and was negated at the point of use; the selection below
# is unchanged.
with_region = courses['region'].notnull()

courses.loc[(distance_courses | online_courses) & with_region].shape

(66, 13)