# Factors affecting drinking in highschool students

#### Data info:
The data set was collected in Portugal for students of two schools

# Load Data and Libraries

#### Loaded Portuguese Class Data - Math Class only added 13 unique data points, which isn't very much out of almost 700

In [156]:
import pandas as pd

df = pd.read_csv('Capstone Project/student-portuguese.csv')



In [157]:
#other libraries and stats models to import - having all of them in a single place makes it easier to track
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import sklearn.neighbors as knn
import sklearn.datasets as ds
import sklearn.model_selection as ms
import sklearn.neighbors as knn

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV


# Clean the data

##### Look for any obvious gaps, missing data, or outliers

In [158]:
#checked data completeness - looks like there are no nulls
print(df.count())

school        649
sex           649
age           649
address       649
famsize       649
Pstatus       649
Medu          649
Fedu          649
Mjob          649
Fjob          649
reason        649
guardian      649
traveltime    649
studytime     649
failures      649
schoolsup     649
famsup        649
paid          649
activities    649
nursery       649
higher        649
internet      649
romantic      649
famrel        649
freetime      649
goout         649
Dalc          649
Walc          649
health        649
absences      649
G1            649
G2            649
G3            649
dtype: int64


In [159]:
#checking unique answers for drinking levels questions - was curioius if there would be any zeros

print(df['Dalc'].unique())
print(df['Walc'].unique())



[1 2 5 3 4]
[1 3 2 4 5]


## Alcohol Counsumption Variables

##### See what the counts look like for the alcohol-related variables

In [160]:
df['Walc_count']=df.groupby('Walc').size()


In [161]:
df['Dalc_count']=df.groupby('Dalc').size()

In [162]:
%matplotlib

plt.hist(x=df['Dalc'],bins=5, color='green', data=df['Dalc_count'])
plt.xlabel('Level of Weekday Drinking')
plt.ylabel('Total Students')
plt.title('Weekday Drinking Response Distribution')
plt.axis([1,5,1,450])
plt.grid
plt.show()



#plt.hist(x=df['Walc'],bins=5, color='blue', data=df['Walc_count'])

Using matplotlib backend: Qt5Agg


In [163]:
%matplotlib

plt.hist(x=df['Walc'],bins=5, color='blue', data=df['Walc_count'])
plt.xlabel('Level of Weekend Drinking')
plt.ylabel('Total Students')
plt.title('Weekend Drinking Response Distribution')
plt.axis([1,5,1,450])
plt.show()



#plt.hist(x=df['Walc'],bins=5, color='blue', data=df['Walc_count'])

Using matplotlib backend: Qt5Agg


In [164]:
#Created a variable that unifies weekday and weekend drinking, alc

df['alc']=df['Dalc']+df['Walc']


In [165]:
df.groupby('alc').size()

alc
2     241
3     116
4      99
5      73
6      50
7      32
8      17
9       6
10     15
dtype: int64

In [166]:
df['alc_count']=df.groupby('alc').size()

In [167]:
%matplotlib

plt.hist(x=df['Walc'],bins=5, color='red', data=df['Walc_count'])
plt.xlabel('Weekend and Weekday Drinking')
plt.ylabel('Total Students')
plt.title('Weekend + Weekday Drinking Response Distribution')
plt.axis([1,5,1,450])
plt.show()



Using matplotlib backend: Qt5Agg


## Age

##### Due to the fact that 20, 21, and 22 year-olds have 6, 2, and 1 student respectively, I will drop all three to avoid skewing the data due to factors relating to age that I can't easily account for, so I am going to filter out all ages where len(age) is less than or equal to 20.

In [168]:
df = df.groupby('age').filter(lambda x: len(x)>20)

In [169]:
#verify that the dataset now looks as I expect - only ages 15-19

df.groupby('age').size()

age
15    112
16    177
17    179
18    140
19     32
dtype: int64

In [170]:
df.groupby('age').size()

age
15    112
16    177
17    179
18    140
19     32
dtype: int64

## Check for Normality

### OUTCOME: 
Only G1 (one of three sets of grades) is normal. Will stay away from models that require a normal distribution.

In [171]:
normal=[]

mylistnotcategorical= ['age','failures','absences', 'G1', 'G2', 'G3']

for var in mylistnotcategorical:
    if stats.normaltest(df[var]).pvalue > 0.5:
        normal.append(var)
    print(normal)
    print(var + ": " + '\nNormal Test: {}'.format(stats.normaltest(df[var]).pvalue))

[]
age: 
Normal Test: 2.9739239935981005e-18
[]
failures: 
Normal Test: 2.550076851541849e-99
[]
absences: 
Normal Test: 3.8536119152460206e-64
['G1']
G1: 
Normal Test: 0.9226859823253339
['G1']
G2: 
Normal Test: 1.7906402177497228e-09
['G1']
G3: 
Normal Test: 1.5181025716180042e-25


##### Created a graph to view the distributions more visually

In [172]:
mylistnotcategorical= ['age','failures','absences', 'G1', 'G2', 'G3']


%matplotlib notebook
for var in mylistnotcategorical:
    df[var].plot(kind='density')



In [173]:
%matplotlib notebook

df.boxplot(column='Dalc', by='age')
plt.xlabel('Age')
plt.ylabel('Weekday Drinking')
plt.show()




In [174]:
%matplotlib notebook

df.boxplot(column='Dalc', by='Pstatus')




<matplotlib.axes._subplots.AxesSubplot at 0x143102d0>

In [175]:
%matplotlib notebook

df.boxplot(column='Walc', by='Pstatus')



<matplotlib.axes._subplots.AxesSubplot at 0x146ad650>

In [176]:
df.groupby('Walc')['Pstatus'].count()

Walc
1    244
2    148
3    119
4     85
5     44
Name: Pstatus, dtype: int64

In [177]:
df.groupby('age')['Dalc'].describe()

age       
15   count    112.000000
     mean       1.383929
     std        0.725911
     min        1.000000
     25%        1.000000
     50%        1.000000
     75%        2.000000
     max        5.000000
16   count    177.000000
     mean       1.395480
     std        0.798850
     min        1.000000
     25%        1.000000
     50%        1.000000
     75%        2.000000
     max        5.000000
17   count    179.000000
     mean       1.553073
     std        0.960588
     min        1.000000
     25%        1.000000
     50%        1.000000
     75%        2.000000
     max        5.000000
18   count    140.000000
     mean       1.564286
     std        1.026349
     min        1.000000
     25%        1.000000
     50%        1.000000
     75%        2.000000
     max        5.000000
19   count     32.000000
     mean       1.781250
     std        1.128355
     min        1.000000
     25%        1.000000
     50%        1.000000
     75%        3.000000
     max      

## Target Variables

Alc makes sense since it's the overall drinking indicator. I want to see if weekend and weekday drinking are different. It's also easier for some models to work with five categories instead of 9.

# Dummies

Create dummies and remove original variable

In [178]:
#figure out variables with type object to create dummies

df.dtypes  

school         object
sex            object
age             int64
address        object
famsize        object
Pstatus        object
Medu            int64
Fedu            int64
Mjob           object
Fjob           object
reason         object
guardian       object
traveltime      int64
studytime       int64
failures        int64
schoolsup      object
famsup         object
paid           object
activities     object
nursery        object
higher         object
internet       object
romantic       object
famrel          int64
freetime        int64
goout           int64
Dalc            int64
Walc            int64
health          int64
absences        int64
G1              int64
G2              int64
G3              int64
Walc_count    float64
Dalc_count    float64
alc             int64
alc_count     float64
dtype: object

In [179]:
needdummies = ['sex','address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian','schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'school']
delfordummies=[]

for var in needdummies:
    df = pd.concat([df ,pd.get_dummies(df[var], drop_first =True, prefix = "is_" + var)], axis=1)
    delfordummies.append(var)
    del df[var]

mylist=list(df.columns.values)

In [180]:
#Verify no object types left

df.dtypes

age                       int64
Medu                      int64
Fedu                      int64
traveltime                int64
studytime                 int64
failures                  int64
famrel                    int64
freetime                  int64
goout                     int64
Dalc                      int64
Walc                      int64
health                    int64
absences                  int64
G1                        int64
G2                        int64
G3                        int64
Walc_count              float64
Dalc_count              float64
alc                       int64
alc_count               float64
is_sex_M                  uint8
is_address_U              uint8
is_famsize_LE3            uint8
is_Pstatus_T              uint8
is_Mjob_health            uint8
is_Mjob_other             uint8
is_Mjob_services          uint8
is_Mjob_teacher           uint8
is_Fjob_health            uint8
is_Fjob_other             uint8
is_Fjob_services          uint8
is_Fjob_

In [181]:
#want to see what the values look like

for var in mylist:
    print var, ':', df[var].unique()

SyntaxError: invalid syntax (<ipython-input-181-2af4a0f45e64>, line 4)

In [None]:
df.describe

In [187]:
Xvars=list(df.columns.values)

Xvars.remove('alc')
Xvars.remove('Dalc')
Xvars.remove('Walc')

print(Xvars)

['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'health', 'absences', 'G1', 'G2', 'G3', 'Walc_count', 'Dalc_count', 'alc_count', 'is_sex_M', 'is_address_U', 'is_famsize_LE3', 'is_Pstatus_T', 'is_Mjob_health', 'is_Mjob_other', 'is_Mjob_services', 'is_Mjob_teacher', 'is_Fjob_health', 'is_Fjob_other', 'is_Fjob_services', 'is_Fjob_teacher', 'is_reason_home', 'is_reason_other', 'is_reason_reputation', 'is_guardian_mother', 'is_guardian_other', 'is_schoolsup_yes', 'is_famsup_yes', 'is_paid_yes', 'is_activities_yes', 'is_nursery_yes', 'is_higher_yes', 'is_internet_yes', 'is_romantic_yes', 'is_school_MS']


# Split Data into train, validation, and test sets

In [183]:
#split my data set into train, validation, and test

train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

In [184]:
#printed out counts to make sure it worked

print 'train: ', train['age'].count()
print 'validate: ', validate['age'].count()
print 'test: ', test['age'].count()

SyntaxError: invalid syntax (<ipython-input-184-d4a962f2c497>, line 3)

# Analysis 

In [None]:
scope1=['age','health', 'famrel','is_Pstatus_T', 'is_sex_M', 'failures',  'is_internet_yes', 'absences']

scope2=['Medu', 'Fedu','is_famsup_yes', 'is_higher_yes', 'is_famsize_LE3', 'is_guardian_other']

scope3=['is_reason_reputation', 'is_activities_yes', 'failures', 'absences', 'goout', 'is_romantic_yes', 'traveltime']

scope4=['is_activities_yes', 'is_Pstatus_T', 'G1', 'is_internet_yes', 'failures']

scope5=['age', 'is_sex_M', 'is_internet_yes', 'famrel', 'is_higher_yes']

scope6=['age', 'failures', 'absences', 'is_higher_yes', 'is_nursery_yes', 'is_Mjob_health', 'is_Fjob_health']

I tried some groupings of factors that I would expect would relate to each other, to see if the score for the model was better that way.

In [None]:
#Personal Relationships
scope6=[ 'famrel', 'is_famsize_LE3', 'is_Pstatus_T', 'is_guardian_other', 'is_guardian_mother', 'is_famsup_yes', 
        'is_romantic_yes']

#Allocation of time
scope7=[ 'traveltime', 'studytime','freetime', 'absences',]

#Educational Resources
scope8=['absences','failures','is_reason_reputation','is_schoolsup_yes', 'is_famsup_yes', 'is_internet_yes',]

#Motivation
scope9=[ 'studytime','failures','absences', 'G1', 'G2', 'G3', 'is_activities_yes','is_higher_yes']

#Socio-economic status
scope10=[ 'Medu', 'Fedu', 'traveltime', 'is_address_U','is_reason_home', 'is_paid_yes', 'is_nursery_yes', 
         'is_internet_yes']

#Personal identifying traits
scope11=['age', 'health',  'is_sex_M', 'is_school_MS']

#Emotional and Personal Support
scope12=['famrel', 'goout','is_schoolsup_yes', 'is_famsup_yes','is_Pstatus_T']

#Parental influence
scope13=['is_Mjob_health', 'is_Mjob_other', 'is_Mjob_services','is_famsize_LE3', 'is_Pstatus_T', 
         'is_Mjob_teacher','is_Fjob_health','is_Fjob_other','is_Fjob_services','is_Fjob_teacher']

### KNeighbors Classifier

In [None]:
model = GridSearchCV(estimator= knn.KNeighborsClassifier(),
                     cv=5,
                     param_grid={'n_neighbors': range(1,10)}, 
                     scoring= 'accuracy')

y=train['Dalc']

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Dalc'] == 0) / float(len(train['Dalc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)
    


In [None]:
model = GridSearchCV(estimator= knn.KNeighborsClassifier(),
                     cv=5,
                     param_grid={'n_neighbors': range(1,10)}, 
                     scoring= 'accuracy')

y=train['Walc']

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Walc'] == 0) / float(len(train['Walc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Walc'] == 0) / float(len(validate['Walc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)
    


In [None]:
model = GridSearchCV(estimator= knn.KNeighborsClassifier(),
                     cv=5,
                     param_grid={'n_neighbors': range(1,10)}, 
                     scoring= 'accuracy')

y=train['alc']

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['alc'] == 0) / float(len(train['alc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['alc'] == 0) / float(len(validate['alc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)
    


### RamdomForestClassifier

In [None]:
scopelist=[scope1, scope2, scope3, scope4, scope5, scope6, scope7, scope8, scope9, scope10, scope11, scope12, scope13]

In [None]:
y=train['Dalc']

model = ms.GridSearchCV(RandomForestClassifier(),
                        param_grid={'n_estimators': [10, 100],
                                    'min_samples_split': [50, 100]},
                        cv=ms.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0),
                        scoring='neg_log_loss')

modeldata=[]

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Dalc'] == 0) / float(len(train['Dalc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

In [None]:
y=train['Dalc']

model = ms.GridSearchCV(RandomForestClassifier(),
                        param_grid={'n_estimators': [10, 100],
                                    'min_samples_split': [50, 100]},
                        cv=ms.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0),
                        scoring='neg_log_loss')

modeldata=[]

#Used this code below to test the importance of features in multiple scopes and get a sense of what variables
#Keep popping up. I created some scopes using those and re-ran the model to try and get the most predictive items

X=scope10
model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata

features = X
feature_importances = model.best_estimator_.feature_importances_

    
features_df = pd.DataFrame({'Features': features, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)

features_df.head()


In [None]:
# higher = more important

for i in range(13):


In [None]:
y=train['Walc']

model = ms.GridSearchCV(RandomForestClassifier(),
                        param_grid={'n_estimators': [10, 100],
                                    'min_samples_split': [50, 100]},
                        cv=ms.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0),
                        scoring='neg_log_loss')

modeldata=[]

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Walc'] == 0) / float(len(train['Walc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Walc'] == 0) / float(len(validate['Walc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

In [None]:
y=train['alc']

model = ms.GridSearchCV(RandomForestClassifier(),
                        param_grid={'n_estimators': [10, 100],
                                    'min_samples_split': [50, 100]},
                        cv=ms.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0),
                        scoring='neg_log_loss')

modeldata=[]

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['alc'] == 0) / float(len(train['alc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['alc'] == 0) / float(len(validate['alc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

### DecisionTreeClassifier

In [None]:

#model=ms.GridSearchCV(DecisionTreeClassifier(), 
      #                {'max_depth': range(2, 10,1), 'min_samples_leaf': range(5,25,5)},scoring='neg_log_loss')

#tried using all variables
model=DecisionTreeClassifier()
model.fit(train[Xvars], train['Dalc'])

acc = sum(model.predict(train[Xvars]) - train['Dalc'] == 0) / float(len(train['Dalc']))
print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)


#now trying with scope loop
y=train['Dalc']

for i in range(13):
    X=scopelist[i] 
    print X
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Dalc'] == 0) / float(len(train['Dalc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)
    
    
    
# Fits the model
#model.fit(X, y)

In [None]:

model=ms.GridSearchCV(DecisionTreeClassifier(), 
                      {'max_depth': range(2, 10,1), 'min_samples_leaf': range(5,25,5)},scoring='neg_log_loss')

#tried using all variables
model.fit(train[scope4], train['Dalc'])

acc = sum(model.predict(train[scope4]) - train['Dalc'] == 0) / float(len(train['Dalc']))
print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[scope4]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

In [None]:

model=ms.GridSearchCV(DecisionTreeClassifier(), 
                      {'max_depth': range(2, 10,1), 'min_samples_leaf': range(5,25,5)},scoring='neg_log_loss')

#tried using all variables
model.fit(train[Xvars], train['Walc'])

acc = sum(model.predict(train[Xvars]) - train['Walc'] == 0) / float(len(train['Walc']))
print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['Walc'] == 0) / float(len(validate['Walc']))
print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

#now trying with scope loop
y=train['Walc']

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Walc'] == 0) / float(len(train['Walc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Walc'] == 0) / float(len(validate['Walc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)
    
    
    
# Fits the model
#model.fit(X, y)

In [None]:

model=ms.GridSearchCV(DecisionTreeClassifier(), 
                      {'max_depth': range(2, 10,1), 'min_samples_leaf': range(5,25,5)},scoring='neg_log_loss')

#tried using all variables
model.fit(train[Xvars], train['Walc'])

acc = sum(model.predict(train[Xvars]) - train['Walc'] == 0) / float(len(train['Walc']))
print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['Walc'] == 0) / float(len(validate['Walc']))
print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

#now trying with scope loop

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Walc'] == 0) / float(len(train['Walc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Walc'] == 0) / float(len(validate['Walc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)
    
    
    
# Fits the model
#model.fit(X, y)

### Adaboost

Used entire dataset only - trying the scope values, the models were at 20% correcteness, which makes sense based on what this model is usually doing

In [189]:
model=AdaBoostClassifier()
model.fit(train[Xvars], train['Dalc'])


acc = sum(model.predict(train[Xvars]) - train['Dalc'] == 0) / float(len(train['Dalc']))
print('Train % Correct: {:0.1f}'.format(acc * 100.))
acc = sum(model.predict(validate[Xvars]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
print('Validate % Correct: {:0.1f}'.format(acc * 100.))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [154]:
model=AdaBoostClassifier()
model.fit(train[Xvars], train['Walc'])


acc = sum(model.predict(train[Xvars]) - train['Walc'] == 0) / float(len(train['Walc']))
print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['Walc'] == 0) / float(len(validate['Walc']))
print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

SyntaxError: invalid syntax (<ipython-input-154-bb8c4d105495>, line 6)

In [109]:
model=AdaBoostClassifier()
model.fit(train[Xvars], train['alc'])


acc = sum(model.predict(train[Xvars]) - train['alc'] == 0) / float(len(train['alc']))
print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['alc'] == 0) / float(len(validate['alc']))
print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)

12 Train % Correct: 27.9
12 Validate % Correct: 25.0


### BaggingClassifier

In [189]:
model = BaggingClassifier()
model.fit(train[Xvars], train['Dalc'])

acc = sum(model.predict(train[Xvars]) - train['Dalc'] == 0) / float(len(train['Dalc']))
print 'Xvars All - Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
print 'Xvars All Validate % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(test[Xvars]) - test['Dalc'] == 0) / float(len(test['Dalc']))
print 'Xxars All Test % Correct: {:0.1f}'.format(acc * 100.)

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Dalc'] == 0) / float(len(train['Dalc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Dalc'] == 0) / float(len(validate['Dalc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)


Xvars All - Train % Correct: 98.4
Xvars All Validate % Correct: 71.9
Xxars All Test % Correct: 64.1
0 Train % Correct: 51.3
0 Validate % Correct: 43.8
1 Train % Correct: 48.2
1 Validate % Correct: 40.6
2 Train % Correct: 48.2
2 Validate % Correct: 35.2
3 Train % Correct: 52.3
3 Validate % Correct: 43.0
4 Train % Correct: 56.5
4 Validate % Correct: 50.8
5 Train % Correct: 58.1
5 Validate % Correct: 46.9
6 Train % Correct: 54.4
6 Validate % Correct: 32.0
7 Train % Correct: 50.8
7 Validate % Correct: 46.9
8 Train % Correct: 48.7
8 Validate % Correct: 34.4
9 Train % Correct: 52.6
9 Validate % Correct: 46.1
10 Train % Correct: 53.4
10 Validate % Correct: 41.4
11 Train % Correct: 54.9
11 Validate % Correct: 44.5
12 Train % Correct: 56.5
12 Validate % Correct: 50.8


In [190]:
model = BaggingClassifier()
model.fit(train[Xvars], train['alc'])

acc = sum(model.predict(train[Xvars]) - train['alc'] == 0) / float(len(train['alc']))
print 'Xvars All - Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['alc'] == 0) / float(len(validate['alc']))
print 'Xvars All Validate % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(test[Xvars]) - test['alc'] == 0) / float(len(test['alc']))
print 'Xxars All Test % Correct: {:0.1f}'.format(acc * 100.)

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['alc'] == 0) / float(len(train['alc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['alc'] == 0) / float(len(validate['alc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)


Xvars All - Train % Correct: 97.9
Xvars All Validate % Correct: 32.0
Xxars All Test % Correct: 32.8
0 Train % Correct: 1.8
0 Validate % Correct: 22.7
1 Train % Correct: 7.8
1 Validate % Correct: 10.2
2 Train % Correct: 4.2
2 Validate % Correct: 15.6
3 Train % Correct: 7.3
3 Validate % Correct: 10.9
4 Train % Correct: 5.2
4 Validate % Correct: 13.3
5 Train % Correct: 6.8
5 Validate % Correct: 10.9
6 Train % Correct: 4.4
6 Validate % Correct: 15.6
7 Train % Correct: 5.2
7 Validate % Correct: 9.4
8 Train % Correct: 0.8
8 Validate % Correct: 15.6
9 Train % Correct: 4.2
9 Validate % Correct: 10.9
10 Train % Correct: 8.9
10 Validate % Correct: 18.0
11 Train % Correct: 7.0
11 Validate % Correct: 10.2
12 Train % Correct: 6.0
12 Validate % Correct: 10.2


In [191]:
model = BaggingClassifier()
model.fit(train[Xvars], train['Walc'])

acc = sum(model.predict(train[Xvars]) - train['Walc'] == 0) / float(len(train['Walc']))
print 'Xvars All - Train % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(validate[Xvars]) - validate['Walc'] == 0) / float(len(validate['Walc']))
print 'Xvars All Validate % Correct: {:0.1f}'.format(acc * 100.)
acc = sum(model.predict(test[Xvars]) - test['Walc'] == 0) / float(len(test['Walc']))
print 'Xxars All Test % Correct: {:0.1f}'.format(acc * 100.)

for i in range(13):
    X=scopelist[i] 
    model.fit(train[X],y)
    #print "best score: ", model.best_score_
    #print "best params: ", model.best_params_
    
    #modeldata.append(model.best_score_)
    #print modeldata
    acc = sum(model.predict(train[X]) - train['Walc'] == 0) / float(len(train['Walc']))
    print i, 'Train % Correct: {:0.1f}'.format(acc * 100.)
    acc = sum(model.predict(validate[X]) - validate['Walc'] == 0) / float(len(validate['Walc']))
    print i, 'Validate % Correct: {:0.1f}'.format(acc * 100.)


Xvars All - Train % Correct: 98.4
Xvars All Validate % Correct: 32.0
Xxars All Test % Correct: 30.5
0 Train % Correct: 87.5
0 Validate % Correct: 24.2
1 Train % Correct: 52.3
1 Validate % Correct: 27.3
2 Train % Correct: 80.5
2 Validate % Correct: 29.7
3 Train % Correct: 52.3
3 Validate % Correct: 29.7
4 Train % Correct: 53.1
4 Validate % Correct: 29.7
5 Train % Correct: 54.2
5 Validate % Correct: 33.6
6 Train % Correct: 69.0
6 Validate % Correct: 27.3
7 Train % Correct: 57.6
7 Validate % Correct: 33.6
8 Train % Correct: 95.1
8 Validate % Correct: 25.0
9 Train % Correct: 68.0
9 Validate % Correct: 27.3
10 Train % Correct: 51.6
10 Validate % Correct: 26.6
11 Train % Correct: 54.2
11 Validate % Correct: 32.8
12 Train % Correct: 46.6
12 Validate % Correct: 30.5


In [179]:
features=X
feature_importances = model.best_estimator_.feature_importances_
features_df = pd.DataFrame({'Features': features, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)

features_df.head()

Unnamed: 0,Features,Importance Score
0,Medu,0.224713
2,traveltime,0.1812
1,Fedu,0.160568
3,is_address_U,0.135396
6,is_nursery_yes,0.123131


In [None]:
GradientBoostingClassifier	sklearn	Gradient-boosted ensemble of decision trees

In [None]:
LinearDiscriminantAnalysis	sklearn	Classification along the axis of maximum class separability

In [None]:
QuadraticDiscriminantAnalysis	sklearn	Classification along the axis of maximum class separability

In [None]:
LinearSVC	sklearn	Linear support vector machine

In [None]:
SVC	sklearn	SVM with nonlinear kernel

In [None]:
LogisticRegression	sklearn	Regress probability of being in class

This would only work if I create a binary alcohol-related variable, e.g. high drinker (1) and low drinker (0)

y = df['interest_level'].apply(lambda x: 1 if x == 'low' else 2 if x == 'medium' else 3)