# PC013 Bloomberg - Improving Indian Farming - Model Selection

### Imports

In [239]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [126]:
loc = '../data/'
rainfall=pd.read_csv(loc + 'rainfallyearly.csv', thousands=',')
average_gdp=pd.read_csv(loc + 'AnnualStateGDP2018.csv', thousands=',')
loan_default_rates=pd.read_csv(loc + 'loan_default_rates.csv', thousands=',')
outstanding_liabilities=pd.read_csv(loc + 'OutstandingLiabilitiesbyState20162017.csv', thousands=',')
roads=pd.read_csv(loc + 'Roads_2013.csv',encoding='utf8', engine='python', thousands=',')
poverty_line=pd.read_csv(loc + 'Ruralpercentagebelowpovertyline20112012.csv', thousands=',')

### PreProcessing

In [127]:
poverty_line.columns=['State','Percent Below Poverty Line']

In [128]:
roads.columns=['State','National Highways','State Highways','PWD Roads','Urban Roads','Project Roads','Rural Roads']
roads['stripped']=roads['State'].apply(lambda x: x.strip())
roads['State']=roads['stripped']
roads.drop('stripped',inplace=True,axis=1)

In [129]:
outstanding_liabilities.columns=['State','Liabilities Billions of Rupees']

In [130]:
loan_default_rates.columns=['State','Year','NPA_Loans_Ratio','Recovery_Ratio']

In [131]:
average_gdp.columns=['State','Nominal GDP INR','Nominal GDP USD']

In [132]:
combined_data=average_gdp.merge(outstanding_liabilities)

In [133]:
combined_data=combined_data.merge(loan_default_rates[loan_default_rates['Year']==2016])

In [134]:
combined_data.drop('Year',inplace=True,axis=1)

In [135]:
combined_data=combined_data.merge(poverty_line)

In [136]:
combined_data = combined_data[combined_data.NPA_Loans_Ratio != 0]

In [137]:
X = combined_data[[u'Nominal GDP INR', u'Nominal GDP USD', u'Liabilities Billions of Rupees', u'Percent Below Poverty Line']]

In [138]:
y = combined_data[u'NPA_Loans_Ratio']

In [243]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1)

### Simple Linear Regression

In [244]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

In [245]:
lr_preds, y_test

(array([9.34371626]), 15    41.36
 Name: NPA_Loans_Ratio, dtype: float64)

### Ridge Regression

In [246]:
ridge = RidgeCV(alphas=(0.1,0.3,0.5,0.8,1), cv=3, normalize=True)
ridge.fit(X_train, y_train)
ridge_preds = ridge.predict(X_test)

In [247]:
ridge_preds, y_test

(array([30.31183491]), 15    41.36
 Name: NPA_Loans_Ratio, dtype: float64)

### Lasso Regression

In [248]:
lasso = LassoCV(alphas=(0.1,0.3,0.5,0.8,1), cv=3, normalize=True)
lasso.fit(X_train, y_train)
lasso_preds = lasso.predict(X_test)

In [249]:
lasso_preds, y_test

(array([28.59029943]), 15    41.36
 Name: NPA_Loans_Ratio, dtype: float64)

### Random Forest Regressor

In [250]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_preds = rfr.predict(X_test)

In [251]:
rfr_preds, y_test

(array([42.461]), 15    41.36
 Name: NPA_Loans_Ratio, dtype: float64)

### Bayesian Ridge Regressor

In [252]:
brr = BayesianRidge(normalize=True)
brr.fit(X_train, y_train)
brr_preds = brr.predict(X_test)

In [253]:
brr_preds, y_test

(array([27.61789334]), 15    41.36
 Name: NPA_Loans_Ratio, dtype: float64)