# Topics
1. Solving Classification Problem

<hr>

In [8]:
import pandas as pd

In [9]:
hr_data = pd.read_csv('../data-science-complete-tutorial/Data/HR_comma_sep.csv.txt')

In [12]:
hr_data.rename(columns={'sales':'dept'}, inplace=True)

In [13]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [14]:
hr_data.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

### Inferances
* Target(left) is discrete in nature so a classification problem

In [15]:
target_data = hr_data.left

### Taking care of categorical columns
* dept & salary is categorical
* ML algorithms only understands numerical data
* We will use preprocessor LabelEncoder to convert this into numerical data

<hr>

In [16]:
from sklearn.preprocessing import LabelEncoder

In [18]:
le_salary = LabelEncoder()

In [19]:
le_salary.fit(hr_data.salary)

LabelEncoder()

In [22]:
le_salary.transform(['low','medium','low'])

array([1, 2, 1])

In [24]:
hr_data['salary_en'] = le_salary.transform(hr_data.salary)

In [25]:
le = LabelEncoder()

In [27]:
le.fit_transform(hr_data.salary)

array([1, 2, 2, ..., 1, 1, 1])

In [28]:
le_dept = LabelEncoder()

In [29]:
le_dept.fit(hr_data.dept)

LabelEncoder()

In [31]:
le_dept.transform(['sales','support'])

array([7, 8])

In [32]:
le_dept.classes_

array(['IT', 'RandD', 'accounting', 'hr', 'management', 'marketing',
       'product_mng', 'sales', 'support', 'technical'], dtype=object)

In [33]:
hr_data['dept_en'] = le_dept.transform(hr_data.dept)

In [35]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,salary_en,dept_en
0,0.38,0.53,2,157,3,0,1,0,sales,low,1,7
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2,7
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2,7
3,0.72,0.87,5,223,5,0,1,0,sales,low,1,7
4,0.37,0.52,2,159,3,0,1,0,sales,low,1,7


In [36]:
#getting feature data from the complete information
feature_data = hr_data.drop(columns=['dept','salary','left'])

In [37]:
#splitting feature & target data into train & test
from sklearn.model_selection import train_test_split

In [38]:
trainX, testX, trainY, testY = train_test_split(feature_data, target_data)

In [39]:
trainX[:2]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en
8961,0.97,0.77,3,245,3,0,0,2,3
162,0.45,0.51,2,147,3,0,0,1,7


In [40]:
# Observation : You are not training the model with string information

# Classification Algorithms
* LogisticRegression, RandomForestClassifier, DecisicionTreeClassifier

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rf = RandomForestClassifier()

In [43]:
rf.fit(trainX,trainY)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
data = [0.38,0.53,2,157,3,1,0,'support','high']

In [51]:
le_salary.transform(['high'])

array([0])

In [52]:
le_dept.transform(['support'])

array([8])

In [53]:
data = [0.38,0.53,2,157,3,1,0,8,0]

In [54]:
rf.predict([data])

array([1])

In [55]:
rf.predict(testX)

array([0, 1, 0, ..., 1, 1, 0])

In [56]:
rf.score(testX,testY)

0.9904

In [57]:
from sklearn.linear_model import LogisticRegression

In [58]:
lr = LogisticRegression()

In [59]:
lr.fit(trainX,trainY)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
lr.score(testX,testY)

0.7541333333333333

In [61]:
lr.predict(testX)

array([0, 0, 0, ..., 1, 0, 0])

In [64]:
testX['lr_predict'] = lr.predict(testX)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [66]:
testX['rf_predict'] = rf.predict(testX.drop(columns=['lr_predict']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [68]:
testX['actual'] = testY

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [69]:
testX.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en,lr_predict,rf_predict,actual
7731,0.73,0.62,3,181,3,0,0,1,7,0,0,0
14271,0.85,0.91,5,226,5,0,0,2,4,0,1,1
10083,0.55,0.62,5,184,4,0,0,2,5,0,0,0
221,0.37,0.5,2,135,3,0,0,1,6,0,1,1
332,0.09,0.98,6,271,4,0,0,1,7,1,1,1


In [71]:
testX[testX.actual != testX.rf_predict]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en,lr_predict,rf_predict,actual
813,0.15,0.55,6,139,4,0,0,0,8,0,0,1
1601,0.68,0.62,5,198,5,1,0,1,1,0,0,1
948,0.61,0.86,4,196,4,0,0,1,3,0,0,1
4519,0.44,0.45,2,124,3,0,0,1,7,0,1,0
1669,0.67,0.54,3,166,5,0,0,2,0,0,0,1
927,0.46,0.86,2,212,4,0,0,2,7,1,0,1
1541,0.61,0.46,5,220,4,0,0,1,7,0,0,1
6263,0.9,0.87,4,231,5,0,0,1,4,0,1,0
1580,0.49,0.73,4,244,3,0,0,1,8,0,0,1
10237,0.88,0.81,4,235,6,0,0,2,7,0,1,0
