In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math, os
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [2]:
df = pd.read_csv('interpolated_full.csv')
del df['Unnamed: 0']

def setCategoryValue(x): # takes an input of 'threshold' 
    if x <= 10: 
        return 0
    return 1 

df['electric_category'] = df['Percentage Electrified'].apply(lambda x: setCategoryValue(x))
df.shape

(33391, 346)

In [3]:
df0 = df[df.electric_category == 0]
df1 = df[df.electric_category == 1]

print (df1.shape, df0.shape)

df0_upsampled = resample(df0, 
                        replace=True,     # sample with replacement
                        n_samples=31191,  # to match majority class
                        random_state=123) # reproducible results

df = pd.concat([df1, df0_upsampled])
df.electric_category.value_counts()

(31191, 346) (2200, 346)


1    31191
0    31191
Name: electric_category, dtype: int64

In [4]:
df, test_data_df = train_test_split(df, test_size=0.2)
print (df.shape, test_data_df.shape)

(49905, 346) (12477, 346)


In [5]:
scaler = MinMaxScaler(feature_range=(0, 1))
#scaler = StandardScaler()
X_training = df.drop(['Census 2011 ID', 'Percentage Electrified', 'Number of Electrified Households', 
                      'Village Name','District Name','State Name', 'electric_category'], axis = 1)
X_training_scaled = pd.DataFrame(scaler.fit_transform(X_training), columns = X_training.columns)

X_testing = test_data_df.drop(['Census 2011 ID', 'Percentage Electrified', 'Number of Electrified Households', 
                      'Village Name','District Name','State Name', 'electric_category'], axis = 1)

Y_training = df[['electric_category']].values
Y_testing = test_data_df[['electric_category']].values

X_training_scaled.head(5)

Unnamed: 0,Number of Households,min,10th_percentile,median,90th_percentile,max,mean,st_dev,sum,area,...,rain_mar_90th,rain_apr_90th,rain_may_90th,rain_jun_90th,rain_jul_90th,rain_aug_90th,rain_sep_90th,rain_oct_90th,rain_nov_90th,rain_dec_90th
0,0.048404,0.004566,0.004362,0.004108,0.005943,0.006415,0.004592,0.003742488,0.021215,0.112871,...,0.257866,0.118137,0.612331,0.274918,0.419732,0.327043,0.273406,0.295655,0.001148,0.211806
1,0.013311,0.01552,0.012605,0.009064,0.00843,0.008331,0.009664,1.606987e-09,0.001335,0.023762,...,0.221399,0.152799,0.591201,0.331829,0.185875,0.393982,0.300419,0.40777,0.0,0.037166
2,0.014521,0.002821,0.003189,0.002658,0.003061,0.003788,0.002927,0.001614199,0.014008,0.152475,...,0.297777,0.295558,0.28089,0.273932,0.271198,0.336937,0.579364,0.15538,0.081115,1.0
3,0.022689,0.013896,0.01128,0.01009,0.026046,0.044538,0.014842,0.01806593,0.022501,0.106931,...,0.552202,0.056031,0.193505,0.322197,0.394177,0.421875,0.431418,0.522783,0.0,0.357253
4,0.015731,0.004345,0.003556,0.00285,0.00294,0.003716,0.003042,0.001156303,0.008977,0.112871,...,0.165402,0.003987,0.419153,0.115023,0.165628,0.151095,0.377437,0.247876,0.0,0.0


In [None]:
clf = svm.SVC()
clf.fit(X_training_scaled.values, Y_training.ravel()) 

In [None]:
test_data_df['predictions'] = clf.predict(X_testing)
test_data_df.shape

In [None]:
test_data_df.head(10)

In [None]:
accuracy_score(Y_testing, test_data_df['predictions'])

In [None]:
# .934 with minmax scaler 