
                
# Problem 3: Somerville Happiness Index  
                

            



## loading dataset in pandas

In [None]:
import pandas as pd
# Load the Somerville happiness survey; cells holding a single blank (" ")
# are treated as missing values.
df_hap = pd.read_csv(
    'https://data.somervillema.gov/api/views/yevj-2b33/rows.csv',
    sep=',',
    header=0,
    na_values=[" "],
)


## checking the number of rows and columns in the dataframe

In [None]:
# Display expression: the (rows, columns) tuple of the raw survey frame.
df_hap.shape

## counting the null values in each of the dataframe columns

In [None]:
# Per-column count of missing values in the raw survey frame.
df_hap.isna().sum()

## drop columns where missing data is more than 50 percent

In [None]:
# Keep only the columns in which fewer than half of the values are missing.
df_hap_rmcol = df_hap.loc[:, df_hap.isna().mean().lt(0.5)]

## as we see below we are left only with 17 columns after the removal of missing data columns

In [None]:
# Display expression: the (rows, columns) tuple after the sparse columns were removed.
df_hap_rmcol.shape

## checking the sum of null values in the updated data

In [None]:
# Per-column count of missing values that remain after dropping sparse columns.
df_hap_rmcol.isna().sum()

## column transformation : cleaning and making column names more understandable

In [None]:
# Give the retained survey columns short snake_case names.
# rename() returns a new DataFrame that is bound back to df_hap_rmcol. The
# previous in-place rename mutated a frame derived from df_hap, which can
# trigger pandas' SettingWithCopyWarning.
df_hap_rmcol = df_hap_rmcol.rename(columns={
    'How.happy.do.you.feel.right.now.': 'happiness_score',
    'How.satisfied.are.you.with.your.life.in.general.': 'satisfaction_general',
    'How.satisfied.are.you.with.Somerville.as.a.place.to.live.': 'satisfaction_somerville',
    'In.general..how.similar.are.you.to.other.people.you.know._2011': 'similarity_2011',
    'When.making.decisions..are.you.more.likely.to.seek.advice.or.decide.for.yourself._2011.': 'decision_making',
    'The.availability.of.affordable.housing_2011': 'housing_availability_2011',
    'How.would.you.rate.the.following..The.overall.quality.of.public.schools.in.your.community._2011': 'school_quality_2011',
    'How.would.you.rate.the.following..The.beauty.or.physical.setting_2011': 'beauty_physical_setting',
    'How.would.you.rate.the.following..The.effectiveness.of.the.local.police_2011_2013': 'police_effectiveness',
    'What.is.your.gender._2011': 'gender_2011',
    'Age.': 'age',
    'Marital.status._2011': 'marital_status_2011',
    'What.is.your.race_2011_2013': 'race_2011_2013',
    'How.long.have.you.lived.here.': 'living_time',
    'What.is.your.annual.household.income.': 'income',
})

## checking the updated column names

In [None]:
# Display the current column labels as a plain Python list.
list(df_hap_rmcol.columns)

## imputation : here we impute the null column values with the most frequent value of that respective column

In [None]:
# Mode imputation, column by column: value_counts() sorts by frequency, so its
# first index entry is the most frequent value, which replaces the NaNs.
df_imputed = df_hap_rmcol.apply(
    lambda col: col.fillna(value=col.value_counts().index[0])
)

## after the cleaning we have no null values in the data ! :)

In [None]:
# Per-column count of missing values after imputation.
df_imputed.isna().sum()

## checking the first 5 rows of the clean data

In [None]:
# Preview the imputed frame; head() shows 5 rows by default.
df_imputed.head()

## creating the dummy variables for achieving the one hot encoding of categorical columns using get_dummies


In [None]:
# One-hot encode the listed categorical columns; every other column is passed
# through unchanged.
df_dummies = pd.get_dummies(
    data=df_imputed,
    columns=[
        'gender_2011',
        'age',
        'marital_status_2011',
        'race_2011_2013',
        'living_time',
        'income',
    ],
)

## checking columns after the one hot encoding of categorical columns

In [None]:
# Display the column labels of the one-hot encoded frame as a plain list.
list(df_dummies.columns)

## removing the non-numeric noisy data from the happiness_score column as it is our dependent variable

In [None]:
# Keep only the rows whose happiness_score string consists of numeric characters.
df_numeric = df_dummies.loc[
    df_dummies['happiness_score'].map(lambda score: score.isnumeric())
]

## making sure all non-numeric data is removed

In [None]:
# Display the distinct happiness_score values that survived the filter.
df_numeric.happiness_score.unique()

In [None]:
# Round-trip the filtered frame through 'num.csv'. Presumably this is done so
# that read_csv re-parses happiness_score as numbers (it is compared with `<`
# in later cells) -- TODO confirm.
# index=False: without it the DataFrame index is written as an unnamed first
# column and comes back as an extra 'Unnamed: 0' column, which would then be
# carried along as if it were a survey feature.
df_numeric.to_csv('num.csv', sep=',', encoding='utf-8', index=False)
df_num = pd.read_csv('num.csv', delimiter=',', header=0, na_values=[" "])

## function to create the binary happiness label: 0 = unhappy, 1 = happy
## people with a happiness score < 6 are unhappy and people with a score >= 6 are happy

In [None]:
def category(row):
    """Return the binary happiness label for one row.

    `row` is indexed with the key 'happiness_score'. The result is the
    string '0' when that score is below 6 and the string '1' otherwise.
    """
    return '0' if row['happiness_score'] < 6 else '1'


# Label every row by calling category() once per row (row-wise apply).
df_num['hapiness_bin'] = df_num.apply(category, axis='columns')

## dropping the actual happiness score column since we created a new binary column with values 0 and 1

In [None]:
# Remove the happiness_score *column*. Passing the Series itself as the first
# positional argument makes drop() interpret its values as row labels, which
# deletes a handful of rows and leaves the score column in the frame.
df_num1 = df_num.drop(columns=['happiness_score'])

In [None]:
import numpy as np

## since some of the columns are still having some noisy data we get rid of those and retain only the numeric data

In [None]:
# Coerce every column to a numeric dtype; values that cannot be parsed as
# numbers become NaN (errors='coerce').
data_columns = list(df_num1.columns)
df_num2 = df_num1.drop(columns=data_columns).join(
    df_num1[data_columns].apply(pd.to_numeric, errors='coerce')
)

# Retain only the rows that have no NaN in any column after the coercion.
df_num2 = df_num2.loc[df_num2[data_columns].notna().all(axis='columns')]

## finally we check the distribution of happy vs non-happy people

In [None]:
# Display how many rows carry each value of the binary label.
df_num2.hapiness_bin.value_counts()

## creating the X and y variables
## X contains all variables except the dependent variable
## y contains our dependent variable 'hapiness_bin' (the column name is spelled this way in the code)

In [None]:
# Feature matrix: every column of df_num2 except the binary label.
X = df_num2.drop(columns=['hapiness_bin'])
# Target vector: the binary label column.
y = df_num2.loc[:, 'hapiness_bin']

## after this we split the data into training and testing data with a 75 and 25 train test split
## we create a logistic regression classifier
## then we fit the logistic regressor on the training data with class_weight = 'balanced' since we saw that we have an imbalanced dataset
## after that we predict on the test data
## finally we compute the auc score

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import roc_auc_score

# Hold out 25% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1)

# Logistic regression with class_weight='balanced'.
mul_lr_balanced = linear_model.LogisticRegression(class_weight='balanced')
mul_lr_balanced.fit(X_train, y_train)

# Hard class predictions; y_pred is kept because later cells read it.
y_pred = mul_lr_balanced.predict(X_test)

# roc_auc_score takes (y_true, y_score). The arguments were previously swapped
# and hard 0/1 predictions were used as scores; AUC should be computed from the
# predicted probability of the positive class (second column of predict_proba,
# i.e. the larger label in classes_).
roc_auc_score(y_test, mul_lr_balanced.predict_proba(X_test)[:, 1])

## checking number of 1's and 0's to make sure model is not predicting only 1 for all records 

In [None]:
 # df_pred is not created in any earlier cell visible here, so assigning a
 # column on it raises NameError. Build the frame directly from the predicted
 # labels, with them stored in the 'pred' column.
 df_pred = pd.DataFrame({'pred': y_pred})

In [None]:
# Display how often each class was predicted.
df_pred.pred.value_counts()

## similarly creating a decision tree classifier
## fitting it on training data
## checking the accuracy

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hold out 25% of the rows for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=1
)

# fit() returns the estimator itself, so construction and fitting are chained.
decision_tree_classifier = DecisionTreeClassifier(class_weight='balanced').fit(
    X_train, y_train
)

# Mean accuracy on the held-out rows.
print("accuracy:", decision_tree_classifier.score(X_test, y_test))


## Now we want to implement a multi-class logit 
## so we create 3 categories of happiness : 1: unhappy, 2: somewhat happy 3: very happy
## for that we create a function and make a new column with these 3 categories

In [None]:
# 3 categories
def category3(row):
    """Return the three-level happiness label for one row.

    `row` is indexed with the key 'happiness_score'. The result is the
    string '1' for scores below 4, '2' for scores from 4 up to (but not
    including) 7, and '3' for everything else.
    """
    score = row['happiness_score']
    if score < 4:
        return '1'
    if score < 7:
        return '2'
    return '3'


# Compute the 3-level label row by row from df_num (which holds the score) and
# store it on df_num2; pandas aligns the assigned values on the row index.
df_num2['hapiness_bin3'] = df_num.apply(category3, axis='columns')

In [None]:
# Features for the 3-class model: remove every column that encodes the target.
# 'hapiness_bin3' is the label itself, 'hapiness_bin' is a coarser binning of
# the same score, and 'happiness_score' is the raw score; leaving any of them
# in X leaks the answer into the features. errors='ignore' skips any of these
# names that is not present in df_num2.
X = df_num2.drop(
    columns=['hapiness_bin3', 'hapiness_bin', 'happiness_score'],
    errors='ignore',
)
# Target vector: the 3-level label.
y = df_num2['hapiness_bin3']

## again we check the distribution of all the three categories

In [None]:
# Display expression: number of rows for each distinct value of y.
y.value_counts()

## now we do the same but here we do multiclass logit 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import roc_auc_score

# 80/20 train/test split (seeded).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)

# Class-balanced multinomial logistic regression using the newton-cg solver;
# fit() returns the estimator, so it is chained onto the constructor.
mul_lr_balanced = linear_model.LogisticRegression(
    multi_class='multinomial',
    class_weight='balanced',
    solver='newton-cg',
).fit(X_train, y_train)

# Mean accuracy on the held-out rows; `a` is read by the assertion block.
a = mul_lr_balanced.score(X_test, y_test)
print(a)

In [None]:
# Assertion block: checks the accuracy stored in the module-level name `a`.

try:
    def verify_answer():
        """Return True when `a` is strictly greater than 0.9, else False."""
        return bool(a > 0.9)

    ref_assert_var = verify_answer()
except Exception as e:
    # Any failure while evaluating the check (e.g. `a` missing) is reported.
    print('Your assertion block throws error: ' + str(e))
else:
    print('looks good :)' if ref_assert_var
          else 'The answer did not pass the test. Try again')


## we then run a decision tree as well on the multiclass data

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hold out 25% of the rows for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=1
)

# fit() returns the estimator itself, so construction and fitting are chained.
decision_tree_classifier = DecisionTreeClassifier(class_weight='balanced').fit(
    X_train, y_train
)

# Mean accuracy on the held-out rows.
print("accuracy:", decision_tree_classifier.score(X_test, y_test))
