# Mercedes-Benz Greener Manufacturing



**Problem Statement: -** Reduce the time that cars spend on the test bench. Analyze different permutations of features in a Mercedes-Benz car to predict the time it takes to pass testing. Optimal algorithms will contribute to faster testing, resulting in lower carbon dioxide emissions without reducing Daimler’s standards.

**Actions to be taken: -**
1. If for any column(s), the variance is equal to zero, then you need to remove those variable(s).
2. Check for null and unique values for test and train sets
3. Apply label encoder.
4. Perform dimensionality reduction.
5. Predict your test_df values using xgboost



In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load datasets
# Each CSV is read with its 'ID' column used as the row index.
df_train, df_test = (
    pd.read_csv(csv_name, index_col='ID') for csv_name in ('train.csv', 'test.csv')
)
print('Training data shape:', df_train.shape)
print('Testing data shape :', df_test.shape)

Training data shape: (4209, 377)
Testing data shape : (4209, 376)


In [3]:
# Training feature matrix: every column of df_train except the target 'y'
df_train_X = df_train.drop(columns=['y'])
print('Training data shape:', df_train_X.shape)

Training data shape: (4209, 376)


In [None]:
# Show the first two rows of the training features
df_train_X.head(2)

In [None]:
# Show the first two rows of the testing data
df_test.head(2)

In [4]:
# Target kept as a single-column DataFrame (list selector), so its shape is 2-D
y = df_train.loc[:, ['y']]
y.shape

(4209, 1)

### 1. Remove columns whose variance is equal to zero

In [None]:
# Identify training columns whose variance is equal to zero

# Only int64 columns are tested; columns of any other dtype (e.g. object)
# are skipped by the dtype check and never reach .var().
cols_to_drop = [
    col for col in df_train_X.columns
    if df_train_X[col].dtype == 'int64' and df_train_X[col].var() == 0
]
# Report the real number of columns found rather than a hard-coded count
print('Below are', len(cols_to_drop), 'columns which variance is equal to Zero')
print(cols_to_drop)

In [None]:
# Remove the columns listed in cols_to_drop from the training features
df_train_X = df_train_X.drop(columns=cols_to_drop)
print(df_train_X.shape)

In [None]:
# Columns to remove from the testing dataset

# The testing data must keep exactly the same feature columns as the training
# data, otherwise a model fitted on df_train_X cannot be applied to df_test.
# The list is therefore taken from the zero-variance columns found in the
# training data (cols_to_drop) instead of being recomputed on the test data.
cols_to_drop_test = [col for col in cols_to_drop if col in df_test.columns]
print('Below are', len(cols_to_drop_test),
      'columns removed from the testing data set (zero variance in the training data set)')
print(cols_to_drop_test)

In [None]:
# Remove the columns listed in cols_to_drop_test from the testing data
df_test = df_test.drop(columns=cols_to_drop_test)
print(df_test.shape)

### 2. Check null and unique values of training and testing data set

In [None]:
# Count the missing entries over every cell of each data set
null_count_train = df_train_X.isna().to_numpy().sum()
null_count_test = df_test.isna().to_numpy().sum()

print('Total Null values in training data set: ', null_count_train)
print('Total Null values in testing data set: ', null_count_test)

In [None]:
# Column wise unique values in training data set

# dropna=False makes a missing value count as a distinct value too
unique_train = df_train_X.nunique(dropna=False)
# Label both columns explicitly. The earlier rename({'col': 'count'}) matched
# no existing label, so the table was shown with the default 'index' / 0 names.
unique_val_train = unique_train.rename_axis('column').reset_index(name='count')
print ('Below are the column wise unique values in Training data set : ')
unique_val_train.T

In [None]:
# Column wise unique values in testing data set

# dropna=False makes a missing value count as a distinct value too
unique_test = df_test.nunique(dropna=False)
# Label both columns explicitly. The earlier rename({'col': 'count'}) matched
# no existing label, so the table was shown with the default 'index' / 0 names.
unique_val_test = unique_test.rename_axis('column').reset_index(name='count')
print ('Below are the column wise unique values in Testing data set : ')
unique_val_test.T

### 3. Apply LabelEncoder

In [None]:
# LabelEncoder is used below to replace the labels of each categorical column
# with integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# Categorical (object dtype) columns that need label encoding.
# The dtype is read from df_train_X itself, the frame that is actually encoded,
# rather than from the unrelated raw df_train frame.
cat_cols_train = [col for col in df_train_X.columns if df_train_X[col].dtype == 'object']
cat_cols_test = [col for col in df_test.columns if df_test[col].dtype == 'object']
print(cat_cols_train)
print(cat_cols_test)

In [None]:
# Apply label encoder on training dataset 

le=LabelEncoder()
for col in cat_cols_train:
    le.fit(df_train_X[col])
    df_train_X[col]=le.transform(df_train_X[col])
    

In [None]:
df_train_X.head(2)

In [None]:
df_test.head(2)

In [None]:
# Apply label encoder on testing dataset 

le=LabelEncoder()
for col in cat_cols_test:
    le.fit(df_test[col])
    df_test[col]=le.transform(df_test[col])

In [None]:
# Show the first two rows of the testing data again, after the encoding cell
df_test.head(2)

In [None]:
# Summary statistics of the training feature columns
df_train_X.describe()

In [None]:
# Summary statistics of the testing data columns
df_test.describe()

In [None]:
# Summary statistics of the target column 'y'
y.describe()

### 4. Perform dimensionality reduction by using PCA

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical between runs.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    df_train_X, y, train_size=0.8, test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test = split_parts
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

In [None]:
# PCA created without n_components; it is fitted in the next cell to inspect
# the explained-variance curve.
from sklearn.decomposition import  PCA
pca=PCA()

In [None]:
# Fit the PCA on the training split and plot the running total of the
# explained-variance ratio against the number of components.
X_train_pca = pd.DataFrame(pca.fit_transform(X_train))
cumulative_variance_ratio = pca.explained_variance_ratio_.cumsum()
plt.plot(cumulative_variance_ratio)

In [None]:
# PCA with n_components=50
N_PCA_COMPONENTS = 50
pca_1 = PCA(n_components=N_PCA_COMPONENTS)

# Learn the projection from the training split only ...
X_train_pca_1 = pd.DataFrame(pca_1.fit_transform(X_train),
                             columns=list(range(N_PCA_COMPONENTS)))
# ... and apply that same fitted projection to the hold-out split. Calling
# fit_transform here would refit the PCA on X_test and produce components
# unrelated to the ones the models are trained on.
X_test_pca_1 = pd.DataFrame(pca_1.transform(X_test),
                            columns=list(range(N_PCA_COMPONENTS)))
X_test_pca_1

In [None]:
# Explained variance of the 50-component PCA (left disabled).
# The stray unterminated string literal that opened this cell was removed, and
# the attributes are now read from the fitted PCA object pca_1: the transformed
# DataFrame X_test_pca_1 has no explained_variance_ attributes.
#print(pca_1.explained_variance_)
#print(pca_1.explained_variance_ratio_)
#print(pca_1.explained_variance_ratio_.cumsum())

In [None]:
# Variance inflation factor helper; only the import is active in this cell.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Disabled: VIF of each column of the PCA-transformed training split.
##vif_pca = pd.DataFrame()
#vif_pca["VIF Factor_PCA"] = [variance_inflation_factor(X_train_pca_1.values, i) for i in range(X_train_pca_1.shape[1])]

#vif_pca['Column Name']=X_train_pca_1.columns

#vif_pca.T

In [None]:
# Disabled: VIF of each column of the training split before PCA.
#from statsmodels.stats.outliers_influence import variance_inflation_factor

#vif = pd.DataFrame()
#vif["VIF Factor"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]

#vif['Column Name']=X_train.columns
#vif.T


In [None]:
# Multiple linear regression algorithm on the PCA-transformed features

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error

lin_reg = LinearRegression()
lin_reg.fit(X_train_pca_1, y_train)

# Predict the hold-out split once and reuse the prediction for both metrics
y_linreg_pred = lin_reg.predict(X_test_pca_1)

# The test R^2 is computed from the stored prediction. Previously an r2_score
# result was computed and discarded, and lin_reg.score predicted a second time.
print('Train Score:---------------',lin_reg.score(X_train_pca_1,y_train))
print('Test Score:----------------',r2_score(y_true=y_test,y_pred=y_linreg_pred))
print('model mean_squared_error: -',mean_squared_error(y_test, y_linreg_pred))


In [None]:
#Randomforest algorithm without PCA

from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the reported scores are the same on every run
randf = RandomForestRegressor(random_state=42)
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# regressor expects instead of relying on the globally silenced warning.
randf.fit(X_train, y_train.values.ravel())
pred = randf.predict(X_test)
print(r2_score(y_test, pred))
print(mean_squared_error(y_test,pred))

In [None]:
#Randomforest algorithm with PCA

from sklearn.ensemble import RandomForestRegressor
# Fixed seed so the reported scores are the same on every run
randf = RandomForestRegressor(random_state=42)
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# regressor expects instead of relying on the globally silenced warning.
randf.fit(X_train_pca_1, y_train.values.ravel())
pred = randf.predict(X_test_pca_1)
print(r2_score(y_test, pred))
print(mean_squared_error(y_test,pred))

### 5. Predict your test_df values using xgboost