# Regression Case Study

## Gathering Data

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

num = pd.read_csv('numerical.csv')
cat = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')
data = pd.concat([num, cat, targets], axis = 1)
data.head()

Unnamed: 0.2,Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,...,SOLIH,VETERANS,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,Unnamed: 0.1,TARGET_B,TARGET_D
0,0,8901,0,3712,60.0,5.0,,0,0,39,...,,,L,E,C,T,2,0,0,0.0
1,1,9401,1,5202,46.0,6.0,9.0,16,0,15,...,,,L,G,A,S,1,1,0,0.0
2,2,9001,1,0,61.611649,3.0,1.0,2,0,20,...,,,L,E,C,R,2,2,0,0.0
3,3,8701,0,2801,70.0,1.0,4.0,2,0,23,...,,,L,E,C,R,2,3,0,0.0
4,4,8601,0,2001,78.0,3.0,2.0,60,1,28,...,12.0,,L,F,A,S,2,4,0,0.0


# Reducing Features

In [20]:
to_drop_columns = list()

In [21]:
null_col=data.columns[data.isna().any()].tolist()
null_col

['WEALTH1', 'MSA', 'ADI', 'DMA', 'NEXTDATE', 'OSOURCE', 'SOLIH', 'VETERANS']

In [22]:
to_drop_columns.append(['SOLIH','VETERANS'])

In [23]:
to_drop_columns.append(['Unnamed: 0','DOB'])

In [24]:

to_drop_columns.append(['ODATEDW','TCODE'])


In [25]:
to_drop_columns.append('ZIP')
to_drop_columns.append('OSOURCE')

# dropping columns with more than 85% NaN values

In [37]:
data['WEALTH1'].isna().sum() / len(data['WEALTH1']) > .85

False

In [35]:
for column in data.columns:
    if data[column].isna().sum() / len(data[column]) > .85:
        if column not in to_drop_columns:
            to_drop_columns.append(column)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
data = data.drop(columns = to_drop_columns)

In [None]:
data.head()

In [None]:
null_col

In [None]:
data['GEOCODE2'] = data['GEOCODE2'].replace({' ': np.NaN})
data['GEOCODE2'] = data['GEOCODE2'].fillna(data['GEOCODE2'].mode()[0])

In [None]:
data['WEALTH1'].isna()


In [None]:
#data['WEALTH1'] = data['WEALTH1'].replace({' ': np.NaN})
data['WEALTH1'] = data['WEALTH1'].fillna(data['WEALTH1'].mode()[0])

In [None]:
data['ADI'] = data['ADI'].fillna(np.ceil(data['ADI'].mean()))

In [None]:
data['DMA'] = data['DMA'].fillna(np.ceil(data['DMA'].mean()))

In [None]:
data['MSA'] = data['MSA'].fillna(np.ceil(data['MSA'].mean()))

In [None]:
data.isna().sum()

### Target is imbalanced.  Balancing Data

In [None]:
data['TARGET_B'].value_counts()

In [None]:
category_0 = data[data['TARGET_B']==0].sample(len(data[data['TARGET_B']==1])) #
print(category_0.shape)

category_1 = data[data['TARGET_B']== 1 ]
print(category_1.shape)

data = pd.concat([category_0, category_1], axis = 0)
data = data.sample(frac =1)
data = data.reset_index(drop=True)

print(data.shape)


In [None]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)


numericalX = X.select_dtypes(np.number)
categorcalX = X.select_dtypes(object)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categorcalX)
encoded_categorical = encoder.transform(categorcalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
X_train

In [None]:
X_r = data[data['TARGET_B']]==1,data.drop(['TARGET_D'], axis = 1)
y_r = data[data['TARGET_B']]==1,['TARGET_D']

numericalX = X_r.select_dtypes(np.number)
categorcalX = X_r.select_dtypes(np.object)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categorcalX)
encoded_categorical = encoder.transform(categorcalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X_r = pd.concat([numericalX, encoded_categorical], axis = 1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_r, y_r, test_size=0.2, random_state=0)

In [None]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)
print("Train score: ", clf.score(X_train, y_train))
print("Test Score: ",clf.score(X_test, y_test))

In [None]:
# For cross validation
from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Mean cross validation score: ",np.mean(cross_val_scores))