# Classification Analysis (Chapter 4)

+ This notebook contains the tools used to do a basic classification analysis according the contributions of the participants at the CRD game. These results are summarized in Chapter 4 (Section 4.3) of the Master Thesis report. 

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## Load Data:

In [35]:
# CLIMATE DAU 
partida_dau = pd.read_csv("dades/dades_csv/climatedau/game_partida.csv", delimiter=',' , encoding="latin-1")
ronda_dau = pd.read_csv("dades/dades_csv/climatedau/game_ronda.csv", delimiter=',' , encoding="latin-1")
user_dau = pd.read_csv("dades/dades_csv/climatedau/game_user.csv", delimiter=',' , encoding="latin-1")

user_dau.drop(user_dau.index[3], inplace=True)
userronda_dau = pd.read_csv("dades/dades_csv/climatedau/game_userronda.csv", delimiter=',' , encoding="latin-1")
user_dau['genere'] =user_dau['genere'].map({'d': 1, 'h': 0})


eq_user_dau = pd.read_csv("dades/eq_user_dau.csv", delimiter=',' , encoding="latin-1")
ineq_user_dau = user_dau[~user_dau.user_id.isin(eq_user_dau['user_id'])]
eq_user_dau =eq_user_dau.set_index('user_id')
ineq_user_dau =ineq_user_dau.set_index('user_id')

In [46]:
eq_user_dau['genere']=eq_user_dau['genere'].map({'d':1,'h':0})

In [25]:
# CLIMATE STREET 
partida_street = pd.read_csv("dades/dades_csv/climatestreet/game_partida_street.csv", delimiter=',' , encoding="latin-1")
ronda_street = pd.read_csv("dades/dades_csv/climatestreet/game_ronda_street.csv", delimiter=',' , encoding="latin-1")
user_street = pd.read_csv("dades/dades_csv/climatestreet/game_user_street.csv", delimiter=',' , encoding="latin-1")
user_street.drop(user_street[user_street['diners_inicials']== 0].index, inplace=True)
userronda_street = pd.read_csv("dades/dades_csv/climatestreet/game_userronda_street.csv", delimiter=',' , encoding="latin-1")
user_street['genere'] =user_street['genere'].map({'d': 1, 'h': 0})

# CLIMATE VIL
partida_vil = pd.read_csv("dades/dadesvil/partida.csv", delimiter=',' , encoding="latin-1")
ronda_vil = pd.read_csv("dades/dadesvil/ronda.csv", delimiter=',' , encoding="latin-1")
user_vil = pd.read_csv("dades/dadesvil/user.csv", delimiter=',' , encoding="latin-1")
user_vil.drop(user_vil[user_vil['diners_inicials']== 0].index, inplace=True)
userronda_vil = pd.read_csv("dades/dadesvil/userronda.csv", delimiter=',' , encoding="latin-1")
user_vil['genere'] =user_vil['genere'].map({'F': 1, 'M': 0})

user_street.rename(columns={'id': 'user_id'}, inplace=True)
user_vil.rename(columns={'id': 'user_id'}, inplace=True)

In [3]:
def contribution_round(user, userronda, norm):
    tonorm = user[['user_id','diners_inicials']]
    tocontr = pd.merge(userronda,tonorm, on = 'user_id')
    #Normalization
    tocontr['selnorm'] = (tocontr['seleccio']/tocontr['diners_inicials'])*10
    rondes = np.arange(1,11)
    ronda_id = np.tile(rondes,len(np.unique(userronda['user_id']))) #324 users
    tocontr['ronda_id'] = ronda_id
    contr_round = pd.pivot(index = tocontr['user_id'], columns = tocontr['ronda_id'], values = tocontr[norm])
    return contr_round

In [26]:
# Normalized contributions per round
## DAU
contr_dau_norm = contribution_round(user_dau,userronda_dau, 'selnorm')
contr_dau = contribution_round(user_dau,userronda_dau, 'seleccio')

## DAU eq: 
eq_dau = pd.read_csv("dades/eq_dau.csv", delimiter=',' , encoding="latin-1")
eq_contr_dau = eq_dau.iloc[:,2:12]
eq_contr_dau = eq_contr_dau.set_index(eq_dau.user_id)
eq_contr_dau_norm = contr_dau_norm[contr_dau_norm.index.isin(eq_dau['user_id'])]
namesList = [np.linspace(1,10,10,dtype=int)]
eq_contr_dau.columns = namesList
eq_contr_dau_norm.columns = namesList

## DAU ineq:
ineq_dau = pd.read_csv("dades/ineq_dau.csv", delimiter=',' , encoding="latin-1")
ineq_contr_dau = ineq_dau.iloc[:,2:12]
ineq_contr_dau = ineq_contr_dau.set_index(ineq_dau.user_id)
ineq_contr_dau_norm = contr_dau_norm[contr_dau_norm.index.isin(ineq_dau['user_id'])]
ineq_contr_dau.columns = namesList
ineq_contr_dau_norm.columns = namesList

## STREET
contr_street_norm = contribution_round(user_street,userronda_street, 'selnorm')
contr_street = contribution_round(user_street,userronda_street, 'seleccio')

## VIL
contr_vil_norm = contribution_round(user_vil,userronda_vil, 'selnorm')
contr_vil = contribution_round(user_vil,userronda_vil, 'seleccio')

In [16]:
## Datasets: Heterogeneous, Homogeneous, Heterogeneous norm, Homogeneous norm

frames_het = [ineq_contr_dau, contr_street]
heterogeneous = pd.concat(frames_het)

frames_hom = [eq_contr_dau, contr_vil]
homogeneous = pd.concat(frames_hom)

frames_het = [ineq_contr_dau_norm, contr_street_norm]
heterogeneous_norm = pd.concat(frames_het)

frames_hom = [eq_contr_dau_norm, contr_vil_norm]
homogeneous_norm = pd.concat(frames_hom)

In [47]:
y_het = pd.concat([ineq_user_dau.genere,user_street.genere])
y_hom = pd.concat([eq_user_dau.genere,user_vil.genere])

In [53]:
y_hom.shape

(342,)

In [52]:
heterogeneous.shape

(270, 10)

### Classification according the genere:

+ In this notebook I included the basic results for different classifiers and their scores in the assignation a gender to a each user. This results allows to discuss the results of ...

+ I used the 5-fold cross validation.

In [41]:
def classification(contr_ds,data_genere):
    X = contr_ds
    y = data_genere
    X = np.asarray(contr_ds)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)  
    #Logistic Regression
    kfold = model_selection.KFold(n_splits=5, random_state=7)
    logreg = LogisticRegression(solver = 'newton-cg')
    scoring = 'accuracy'
    results = model_selection.cross_val_score(logreg, X_train, y_train, cv=kfold, scoring=scoring)
    print("5-fold cross validation average accuracy: %.3f" % (results.mean()))
    logreg.fit(X_train, y_train)
    print('Accuracy of LogisticRegression classifier on training set: {:.2f}'
         .format(logreg.score(X_train, y_train)))
    print('Accuracy of LogisticRegression classifier on test set: {:.2f}'
         .format(logreg.score(X_test, y_test)))
    ## Confusion matrix
    pred =logreg.predict(X_test)
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    print('\n')
    ## Decision Tree
    tree = DecisionTreeClassifier()
    scoring = 'accuracy'
    results = model_selection.cross_val_score(tree, X_train, y_train, cv=kfold, scoring=scoring)
    print("5-fold cross validation average accuracy: %.3f" % (results.mean()))
    tree.fit(X_train, y_train)
    print('Accuracy of DecisionTree classifier on training set: {:.2f}'
         .format(tree.score(X_train, y_train)))
    print('Accuracy of DecisionTree classifier on test set: {:.2f}'
         .format(tree.score(X_test, y_test)))  
    ## Confusion matrix
    pred =tree.predict(X_test)
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    print('\n')
    ## KNNeigbors    
    knn = KNeighborsClassifier()
    scoring = 'accuracy'
    results = model_selection.cross_val_score(knn, X_train, y_train, cv=kfold, scoring=scoring)
    print("5-fold cross validation average accuracy: %.3f" % (results.mean()))
    knn.fit(X_train, y_train)
    print('Accuracy of KNN classifier on training set: {:.2f}'
         .format(knn.score(X_train, y_train)))
    print('Accuracy of KNN classifier on test set: {:.2f}'
         .format(knn.score(X_test, y_test)))
    ## Confusion matrix
    pred =knn.predict(X_test)
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    print('\n')
    ## Linear Discriminant Analysis     
    lda = LinearDiscriminantAnalysis()
    results = model_selection.cross_val_score(lda, X_train, y_train, cv=kfold, scoring=scoring)
    print("5-fold cross validation average accuracy: %.3f" % (results.mean()))
    lda.fit(X_train, y_train)
    print('Accuracy of LDA classifier on training set: {:.3f}'
         .format(lda.score(X_train, y_train)))
    print('Accuracy of LDA classifier on test set: {:.3f}'
         .format(lda.score(X_test, y_test)))
    ## Confusion matrix
    pred =lda.predict(X_test)
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    print('\n')
    ## Linear Discriminant Analysis  
    gnb = GaussianNB()
    results = model_selection.cross_val_score(gnb, X_train, y_train, cv=kfold, scoring=scoring)
    print("5-fold cross validation average accuracy: %.3f" % (results.mean()))
    gnb.fit(X_train, y_train)
    print('Accuracy of GNB classifier on training set: {:.3f}'
         .format(gnb.score(X_train, y_train)))
    print('Accuracy of GNB classifier on test set: {:.3f}'
         .format(gnb.score(X_test, y_test)))
    ## Confusion matrix
    pred =gnb.predict(X_test)
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    print('\n')
    ## Support Vector Machine   
    svm = SVC()
    results = model_selection.cross_val_score(svm, X_train, y_train, cv=kfold, scoring=scoring)
    print("5-fold cross validation average accuracy: %.3f" % (results.mean()))
    svm.fit(X_train, y_train)
    print('Accuracy of SVM classifier on training set: {:.3f}'
         .format(svm.score(X_train, y_train)))
    print('Accuracy of SVM classifier on test set: {:.3f}'
         .format(svm.score(X_test, y_test)))
    ## Confusion matrix
    pred = svm.predict(X_test)
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))

### Classification results at treatment level

In [49]:
### Heterogeneous:
classification(heterogeneous_norm,y_het)

5-fold cross validation average accuracy: 0.519
Accuracy of LogisticRegression classifier on training set: 0.62
Accuracy of LogisticRegression classifier on test set: 0.61
[[27  4]
 [17  6]]
             precision    recall  f1-score   support

          0       0.61      0.87      0.72        31
          1       0.60      0.26      0.36        23

avg / total       0.61      0.61      0.57        54



5-fold cross validation average accuracy: 0.510
Accuracy of DecisionTree classifier on training set: 1.00
Accuracy of DecisionTree classifier on test set: 0.52
[[19 12]
 [14  9]]
             precision    recall  f1-score   support

          0       0.58      0.61      0.59        31
          1       0.43      0.39      0.41        23

avg / total       0.51      0.52      0.52        54



5-fold cross validation average accuracy: 0.551
Accuracy of KNN classifier on training set: 0.72
Accuracy of KNN classifier on test set: 0.37
[[14 17]
 [17  6]]
             precision    recall  f

  'precision', 'predicted', average, warn_for)


In [50]:
### Homogeneous:
classification(homogeneous_norm,y_hom)

5-fold cross validation average accuracy: 0.678
Accuracy of LogisticRegression classifier on training set: 0.68
Accuracy of LogisticRegression classifier on test set: 0.70
[[48  0]
 [21  0]]
             precision    recall  f1-score   support

          0       0.70      1.00      0.82        48
          1       0.00      0.00      0.00        21

avg / total       0.48      0.70      0.57        69



5-fold cross validation average accuracy: 0.572
Accuracy of DecisionTree classifier on training set: 0.99
Accuracy of DecisionTree classifier on test set: 0.55
[[34 14]
 [17  4]]
             precision    recall  f1-score   support

          0       0.67      0.71      0.69        48
          1       0.22      0.19      0.21        21

avg / total       0.53      0.55      0.54        69



5-fold cross validation average accuracy: 0.608
Accuracy of KNN classifier on training set: 0.74
Accuracy of KNN classifier on test set: 0.61
[[38 10]
 [17  4]]
             precision    recall  f

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Classification results at dataset level

In [6]:
### Heterogeneous DAU:
classification(ineq_contr_dau_norm,user_dau[user_dau.user_id.isin(ineq_contr_dau_norm.index)].genere)

5-fold cross validation average accuracy: 0.536
Accuracy of LogisticRegression classifier on training set: 0.64
Accuracy of LogisticRegression classifier on test set: 0.64
[[17  3]
 [ 9  4]]
             precision    recall  f1-score   support

          0       0.65      0.85      0.74        20
          1       0.57      0.31      0.40        13

avg / total       0.62      0.64      0.61        33



5-fold cross validation average accuracy: 0.551
Accuracy of DecisionTree classifier on training set: 0.99
Accuracy of DecisionTree classifier on test set: 0.70
[[14  6]
 [ 4  9]]
             precision    recall  f1-score   support

          0       0.78      0.70      0.74        20
          1       0.60      0.69      0.64        13

avg / total       0.71      0.70      0.70        33



5-fold cross validation average accuracy: 0.543
Accuracy of KNN classifier on training set: 0.69
Accuracy of KNN classifier on test set: 0.55
[[12  8]
 [ 7  6]]
             precision    recall  f

In [7]:
### STREET:
classification(contr_street_norm,user_street.genere)

5-fold cross validation average accuracy: 0.499
Accuracy of LogisticRegression classifier on training set: 0.62
Accuracy of LogisticRegression classifier on test set: 0.41
[[ 9  2]
 [11  0]]
             precision    recall  f1-score   support

          0       0.45      0.82      0.58        11
          1       0.00      0.00      0.00        11

avg / total       0.23      0.41      0.29        22



5-fold cross validation average accuracy: 0.580
Accuracy of DecisionTree classifier on training set: 1.00
Accuracy of DecisionTree classifier on test set: 0.50
[[9 2]
 [9 2]]
             precision    recall  f1-score   support

          0       0.50      0.82      0.62        11
          1       0.50      0.18      0.27        11

avg / total       0.50      0.50      0.44        22



5-fold cross validation average accuracy: 0.580
Accuracy of KNN classifier on training set: 0.76
Accuracy of KNN classifier on test set: 0.50
[[8 3]
 [8 3]]
             precision    recall  f1-score 

  'precision', 'predicted', average, warn_for)


In [8]:
### Homogeneous DAU:
classification(eq_contr_dau_norm,user_dau[user_dau.user_id.isin(eq_contr_dau_norm.index)].genere)

5-fold cross validation average accuracy: 0.581
Accuracy of LogisticRegression classifier on training set: 0.65
Accuracy of LogisticRegression classifier on test set: 0.42
[[12  4]
 [15  2]]
             precision    recall  f1-score   support

          0       0.44      0.75      0.56        16
          1       0.33      0.12      0.17        17

avg / total       0.39      0.42      0.36        33



5-fold cross validation average accuracy: 0.621
Accuracy of DecisionTree classifier on training set: 1.00
Accuracy of DecisionTree classifier on test set: 0.58
[[11  5]
 [ 9  8]]
             precision    recall  f1-score   support

          0       0.55      0.69      0.61        16
          1       0.62      0.47      0.53        17

avg / total       0.58      0.58      0.57        33



5-fold cross validation average accuracy: 0.604
Accuracy of KNN classifier on training set: 0.75
Accuracy of KNN classifier on test set: 0.42
[[10  6]
 [13  4]]
             precision    recall  f

  'precision', 'predicted', average, warn_for)


In [9]:
### VIL:
classification(contr_vil_norm,user_vil.genere)

5-fold cross validation average accuracy: 0.722
Accuracy of LogisticRegression classifier on training set: 0.74
Accuracy of LogisticRegression classifier on test set: 0.83
[[30  1]
 [ 5  0]]
             precision    recall  f1-score   support

          0       0.86      0.97      0.91        31
          1       0.00      0.00      0.00         5

avg / total       0.74      0.83      0.78        36



5-fold cross validation average accuracy: 0.596
Accuracy of DecisionTree classifier on training set: 0.99
Accuracy of DecisionTree classifier on test set: 0.58
[[20 11]
 [ 4  1]]
             precision    recall  f1-score   support

          0       0.83      0.65      0.73        31
          1       0.08      0.20      0.12         5

avg / total       0.73      0.58      0.64        36



5-fold cross validation average accuracy: 0.701
Accuracy of KNN classifier on training set: 0.75
Accuracy of KNN classifier on test set: 0.86
[[31  0]
 [ 5  0]]
             precision    recall  f

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
