In [32]:
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import numpy as np
%matplotlib inline 

In [2]:
dataFolder = Path.cwd().joinpath('Data')
df = pd.read_csv(dataFolder.joinpath('label.csv'))
df = df[['user_id', 'label']]  #removing unnecessary columns
df.head()

Unnamed: 0,user_id,label
0,3187755,unknown
1,12609678,public administration and social service profe...
2,41055794,biological and biomedical sciences
3,8477334,unknown
4,1281279,unknown


In [3]:
df = df.drop_duplicates(keep='first') # drop duplicate rows but keep the ones with different label

In [4]:
userDf = pd.read_csv(dataFolder.joinpath('user_profile.csv'))
userDf = userDf[['user_id', 'gender', 'gpa']] #removing unnecessary columns
userDf.head()

Unnamed: 0,user_id,gender,gpa
0,8838926,female,3.6
1,1525079,female,3.5
2,4513255,female,3.0
3,23554544,female,2.7
4,3416751,female,2.2


In [5]:
userDf = pd.merge(df, userDf, on='user_id')
userDf.head()

Unnamed: 0,user_id,label,gender,gpa
0,3187755,unknown,female,3.3
1,12609678,public administration and social service profe...,female,3.8
2,41055794,biological and biomedical sciences,male,3.2
3,8477334,unknown,female,3.4
4,1281279,unknown,female,3.5


In [6]:
with open(dataFolder.joinpath('bk_details.json'), 'r') as f:
    json_data = f.read()
data = json.loads(json_data)
booksDf = pd.DataFrame(data) #Convert JSON to DataFrame
booksDf["book_type"] = booksDf.book_subject_code.str[:3] #get book type by taking first 3 letters
booksDf.head()

Unnamed: 0,book_id,book_subject_code,book_title,book_type
0,22250675,EDU029000,Learning Disabilities and Related Mild Disabil...,EDU
1,42510,SCI055000,Physics for Scientists & Engineers (Chs 1-37),SCI
2,46792,ART000000,Drawing to See,ART
3,33143135,REL006000,Living God's Word,REL
4,34466221,HIS037000,Sources in Patterns of World History,HIS


In [7]:
booksDf = booksDf[booksDf.book_type.notnull()] #remove values where book type is not known

In [8]:
ordersDf = pd.read_csv(dataFolder.joinpath('user_bk_orders.csv'))
ordersDf =  ordersDf.loc[:, ~ordersDf.columns.str.contains('^Unnamed')] #removing unnecessary columns
ordersDf.head()

Unnamed: 0,user_id,book_id
0,35240460,132078
1,26215619,97411
2,1834930,7995
3,40157812,80721
4,41179202,56886


In [9]:
ordersDf = pd.merge(ordersDf, booksDf, on='book_id')
ordersDf.head()

Unnamed: 0,user_id,book_id,book_subject_code,book_title,book_type
0,35240460,132078,SCI015000,Big Bang,SCI
1,26215619,97411,PER010000,Voice and the Actor,PER
2,1834930,7995,PSY000000,Psychological Testing,PSY
3,40157812,80721,PHI000000,Kant,PHI
4,41179202,56886,PHI000000,Plato,PHI


In [10]:
countsDf = ordersDf[['user_id','book_type']].groupby(['user_id','book_type'])['book_type'].count().reset_index(name='counts')
countsDf = pd.pivot_table(countsDf, index='user_id', columns='book_type', values='counts', fill_value=0)
countsDf = countsDf.rename_axis(None, axis=1).reset_index()
countsDf.head()

Unnamed: 0,user_id,ARC,ART,BIB,BIO,BUS,CGN,CKB,COM,CRA,...,SCI,SEL,SOC,SPO,STU,TEC,TRA,TRU,TRV,YAN
0,1197529,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1197915,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1199673,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1199951,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1201529,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
userDf = pd.merge(userDf, countsDf, on='user_id')
userDf.head()

Unnamed: 0,user_id,label,gender,gpa,ARC,ART,BIB,BIO,BUS,CGN,...,SCI,SEL,SOC,SPO,STU,TEC,TRA,TRU,TRV,YAN
0,3187755,unknown,female,3.3,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,12609678,public administration and social service profe...,female,3.8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,41055794,biological and biomedical sciences,male,3.2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,8477334,unknown,female,3.4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1281279,unknown,female,3.5,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
genderMapping = {'male' : 1, 'female' : 0} #convert gender to binary value
userDf = userDf.replace({'gender':genderMapping})
userDf.head()

Unnamed: 0,user_id,label,gender,gpa,ARC,ART,BIB,BIO,BUS,CGN,...,SCI,SEL,SOC,SPO,STU,TEC,TRA,TRU,TRV,YAN
0,3187755,unknown,0,3.3,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,12609678,public administration and social service profe...,0,3.8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,41055794,biological and biomedical sciences,1,3.2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,8477334,unknown,0,3.4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1281279,unknown,0,3.5,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
userDf = userDf.set_index('user_id')
testDf = userDf[userDf.label=='unknown']
trainDf = userDf[userDf.label!='unknown']

In [14]:
trainDf.label.value_counts()

psychology                                              7009
education                                               6872
biological and biomedical sciences                      4417
social sciences                                         3922
public administration and social service professions    2345
visual and performing arts                              1788
personal and culinary services                           185
Name: label, dtype: int64

In [15]:
# baseline is the % of majority class
print("Baseline: {}%".format(len(trainDf[trainDf.label=='psychology'])/len(trainDf)*100))

Baseline: 26.41118396261964%


In [48]:
trainY = trainDf.label

# trainY = trainY.replace(labelMapping)
trainX = trainDf.drop(['label'], axis=1)
testX = testDf.drop(['label'], axis=1)
# trainX = trainX.as_matrix()

In [17]:
trainX

Unnamed: 0_level_0,gender,gpa,ARC,ART,BIB,BIO,BUS,CGN,CKB,COM,...,SCI,SEL,SOC,SPO,STU,TEC,TRA,TRU,TRV,YAN
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12609678,0,3.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41055794,1,3.2,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
40112514,1,3.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30860483,1,3.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40520146,1,3.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4693027,0,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13465898,0,3.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35276680,1,2.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38664940,1,3.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40135650,1,3.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
train_samples, n_features = trainX.shape
n_classes = len(trainY.unique())

In [26]:
seed = 111
solver = 'saga'
scoring = 'accuracy'
models = {'ovr': {'name': 'One versus Rest'},
          'multinomial': {'name': 'Multinomial'}}

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
lr = LogisticRegression(solver='liblinear',
                            multi_class='ovr',
                            C=1,
                            penalty='l1',
                            fit_intercept=True,
                            max_iter=500,
                            random_state=seed,
                            )
kfold = cross_validation.KFold(n=train_samples, n_folds=10, random_state=seed)
cv_results = cross_validation.cross_val_score(lr, trainX, trainY, cv=kfold, scoring=scoring, n_jobs=4)
print("Mean Accuracy: ", np.mean(cv_results))

print(cv_results)

Mean Accuracy:  0.4438171258824307
[0.44687265 0.45327807 0.45440844 0.42991711 0.41974378 0.45139412
 0.43519216 0.44084401 0.43686393 0.46965699]


In [49]:
lr.fit(trainX,trainY)


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=111, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [51]:
 predicted = lr.predict(testX)

In [54]:
testX['label'] =  predicted

In [57]:
testX.label.value_counts()

education                                               9277
psychology                                              6781
biological and biomedical sciences                      4669
social sciences                                         4459
visual and performing arts                              1060
public administration and social service professions     151
personal and culinary services                            46
Name: label, dtype: int64

In [58]:
trainDf.label.value_counts()

psychology                                              7009
education                                               6872
biological and biomedical sciences                      4417
social sciences                                         3922
public administration and social service professions    2345
visual and performing arts                              1788
personal and culinary services                           185
Name: label, dtype: int64

In [82]:
df = pd.concat([trainDf,testX]).reset_index()

In [84]:
df = df[['user_id', 'label']]

In [93]:
#save file to csv
df[df.label.isin(['psychology', 'education', 'biological and biomedical sciences'])].to_csv("responce.csv") 

In [86]:
coefficients = pd.concat([pd.DataFrame(trainX.columns), pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

In [87]:
coefficients

Unnamed: 0,0,0.1,1,2,3,4,5,6
0,gender,0.187212,-0.097388,0.050927,-0.211325,-0.08627,0.309179,0.008374
1,gpa,0.036165,-0.003185,-0.021883,-0.016204,0.089424,-0.086317,-0.016439
2,ARC,-0.872593,-1.563784,0.0,-1.586299,-1.144852,-0.037393,3.04058
3,ART,-0.369621,-0.116152,-0.141581,-0.621986,-0.914417,-0.575082,1.769007
4,BIB,0.0,0.384239,0.0,0.0,0.0,0.0,-0.083257
5,BIO,-0.323226,0.0,-0.304116,0.0,-0.28268,0.50723,-0.436986
6,BUS,-0.617242,-0.460825,1.161994,-0.350615,0.027523,0.842388,0.102357
7,CGN,0.0,-0.737855,0.0,0.0,-0.407272,0.485567,0.696003
8,CKB,-0.465402,-1.84398,4.904891,-0.575466,-0.407475,-0.175342,-0.273614
9,COM,-0.436318,-0.077509,0.094445,-0.2376,-0.083082,0.01039,0.882175
