# Logistic Regression with Scikit-Learn

In [1]:
import numpy as np
import pandas as pd

In [12]:
data_url = "http://www-stat.wharton.upenn.edu/~waterman/DataSets/uva.txt"
dataframe = pd.read_table(data_url)

In [13]:
dataframe[:5]

Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
0,id74364,0,54.0,Male,$50-74,Gay male,Ontario,Some College,Computer,Other,4-6 yr
1,id84505,0,39.0,Female,Over $100,Heterosexual,Sweden,Professional,Other,Other,1-3 yr
2,id84509,1,49.0,Female,$40-49,Heterosexual,Washington,Some College,Management,Other,Under 6 mo
3,id87028,1,22.0,Female,$40-49,Heterosexual,Florida,Some College,Computer,Married,6-12 mo
4,id76087,0,20.0,Male,$30-39,Bisexual,New Jersey,Some College,Education,Single,1-3 yr


In [14]:
dataframe.shape

(19583, 11)

In [15]:
dataframe.dtypes

who                      object
Newbie                    int64
Age                     float64
Gender                   object
Household Income         object
Sexual Preference        object
Country                  object
Education Attainment     object
Major Occupation         object
Marital Status           object
Years on Internet        object
dtype: object

In [16]:
dataframe["Marital Status"].unique()

array(['Other', 'Married', 'Single', 'Divorced', 'Widowed', nan,
       'Separated'], dtype=object)

In [17]:
dataframe.pop('who') 
dataframe.pop('Country')
dataframe.pop('Years on Internet')

dataframe.dtypes

Newbie                    int64
Age                     float64
Gender                   object
Household Income         object
Sexual Preference        object
Education Attainment     object
Major Occupation         object
Marital Status           object
dtype: object

In [18]:
for col in ['Household Income', 'Sexual Preference', 'Education Attainment', 'Major Occupation', "Marital Status"]:
    dataframe[col] = dataframe[col].astype('category')
dataframe.dtypes

Newbie                     int64
Age                      float64
Gender                    object
Household Income        category
Sexual Preference       category
Education Attainment    category
Major Occupation        category
Marital Status          category
dtype: object

In [19]:
dataframe.head()

Unnamed: 0,Newbie,Age,Gender,Household Income,Sexual Preference,Education Attainment,Major Occupation,Marital Status
0,0,54.0,Male,$50-74,Gay male,Some College,Computer,Other
1,0,39.0,Female,Over $100,Heterosexual,Professional,Other,Other
2,1,49.0,Female,$40-49,Heterosexual,Some College,Management,Other
3,1,22.0,Female,$40-49,Heterosexual,Some College,Computer,Married
4,0,20.0,Male,$30-39,Bisexual,Some College,Education,Single


In [23]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
dataframe_modified = pd.get_dummies(dataframe)
dataframe_modified[:5]

Unnamed: 0,Newbie,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,0,54.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,39.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,49.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,22.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,20.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
dataframe_modified.shape

(19583, 38)

In [25]:
dataframe_modified.isnull().sum()

Newbie                                 0
Age                                  561
Gender_Female                          0
Gender_Male                            0
Household Income_$10-19                0
Household Income_$20-29                0
Household Income_$30-39                0
Household Income_$40-49                0
Household Income_$50-74                0
Household Income_$75-99                0
Household Income_Over $100             0
Household Income_Under $10             0
Sexual Preference_Bisexual             0
Sexual Preference_Gay male             0
Sexual Preference_Heterosexual         0
Sexual Preference_Lesbian              0
Sexual Preference_Transgender          0
Sexual Preference_na                   0
Education Attainment_College           0
Education Attainment_Doctoral          0
Education Attainment_Grammar           0
Education Attainment_High School       0
Education Attainment_Masters           0
Education Attainment_Other             0
Education Attain

In [38]:
dataframe_modified.loc[
    pd.isnull(dataframe_modified['Age']), "Age"] = dataframe_modified['Age'].mean()

In [39]:
dataframe_modified['Age'][pd.isnull(dataframe_modified['Age'])]

Series([], Name: Age, dtype: float64)

In [40]:
dataframe_modified.head()

Unnamed: 0,Newbie,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,0,54.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,39.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,49.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,22.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,20.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [45]:
x_data = dataframe_modified.iloc[:, 1:].as_matrix()
y_data = dataframe_modified.iloc[:, 0].as_matrix().reshape(-1, 1)
y_data.shape, x_data.shape

((19583, 1), (19583, 37))

In [44]:
from sklearn import preprocessing # Min-Max Standardzation

min_max_scaler = preprocessing.MinMaxScaler()
x_data = min_max_scaler.fit_transform(x_data)

(19583, 37)

In [46]:
import numpy as np 

training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

x_training, x_test = x_data[training_idx,:], x_data[test_idx,:]
y_training, y_test = y_data[training_idx,:], y_data[test_idx,:]

x_training.shape, x_test.shape

((15666, 37), (3916, 37))

In [47]:
from sklearn import linear_model, datasets

logreg = linear_model.LogisticRegression(fit_intercept=True)
logreg.fit(x_training, y_training.flatten())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
logreg.predict(x_test[:5])

array([1, 0, 0, 0, 0], dtype=int64)

In [50]:
logreg.predict_proba(x_test[:5])

array([[ 0.42964178,  0.57035822],
       [ 0.76990579,  0.23009421],
       [ 0.83668251,  0.16331749],
       [ 0.75340325,  0.24659675],
       [ 0.90584214,  0.09415786]])

In [51]:
x_test.shape

(3916, 37)

In [52]:
y_test.shape

(3916, 1)

In [53]:
logreg.predict(x_test) == y_test.flatten()

array([ True,  True,  True, ..., False,  True, False], dtype=bool)

In [54]:
sum(logreg.predict(x_test) == y_test.flatten()) / len(y_test)

0.76430030643513791

In [55]:
logreg.predict_proba(x_test)

array([[ 0.42964178,  0.57035822],
       [ 0.76990579,  0.23009421],
       [ 0.83668251,  0.16331749],
       ..., 
       [ 0.76686911,  0.23313089],
       [ 0.95217496,  0.04782504],
       [ 0.50331933,  0.49668067]])

In [56]:
logreg.decision_function(x_test[:5])

array([ 0.28331282, -1.20777934, -1.63374856, -1.11684619, -2.26389237])

In [57]:
logreg.predict(x_test[:5])

array([1, 0, 0, 0, 0], dtype=int64)