# Multi-class classification

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the auto dataset from 'auto.csv' (path is relative to the working directory).
cars = pd.read_csv('auto.csv')

In [4]:
# Distinct origin codes, in order of first appearance; these are the class labels.
unique_regions = pd.unique(cars['origin'])

In [6]:
# Show the distinct origin codes collected in unique_regions.
print(unique_regions)

[1 3 2]


# Nominal variables

In [7]:
# Frequency of each cylinder count; this column is one-hot encoded further down.
cars['cylinders'].value_counts()

4    199
8    103
6     83
3      4
5      3
Name: cylinders, dtype: int64

In [8]:
# Frequency of each model-year value; this column is one-hot encoded further down.
cars['year'].value_counts()

73    40
78    36
76    34
75    30
82    30
70    29
79    29
72    28
77    28
81    28
71    27
80    27
74    26
Name: year, dtype: int64

In [9]:
# One-hot encode the cylinder count into cyl_<n> indicator columns.
# dtype is pinned to int because the get_dummies default differs between pandas
# versions (bool in pandas >= 2.0, uint8 before); this keeps the 0/1 columns
# consistent regardless of the installed version.
dummy_df = pd.get_dummies(cars['cylinders'], prefix='cyl', dtype=int)

In [10]:
# Inspect the one-hot encoded cylinder columns (one cyl_<n> column per distinct value).
dummy_df

Unnamed: 0,cyl_3,cyl_4,cyl_5,cyl_6,cyl_8
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
...,...,...,...,...,...
387,0,1,0,0,0
388,0,1,0,0,0
389,0,1,0,0,0
390,0,1,0,0,0


In [11]:
# Append the cyl_* indicator columns side by side with the existing columns.
# NOTE: this rebinds `cars`, so re-running the cell adds the columns again.
cars = pd.concat(objs=[cars, dummy_df], axis='columns')

In [12]:
# Confirm the cyl_* indicator columns now sit alongside the original columns.
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,cyl_3,cyl_4,cyl_5,cyl_6,cyl_8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,0,0,0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,0,0,0,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,0,0,0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,0,0,0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,0,0,0,0,1


In [13]:
# One-hot encode the model year into year_<yy> indicator columns.
# dtype is pinned to int because the get_dummies default differs between pandas
# versions (bool in pandas >= 2.0, uint8 before); this keeps the 0/1 columns
# consistent regardless of the installed version.
dummy_df_year = pd.get_dummies(cars['year'], prefix='year', dtype=int)

In [14]:
# Inspect the one-hot encoded year columns (one year_<yy> column per distinct value).
dummy_df_year

Unnamed: 0,year_70,year_71,year_72,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,0,0,0,0,0,0,0,0,0,0,0,0,1
388,0,0,0,0,0,0,0,0,0,0,0,0,1
389,0,0,0,0,0,0,0,0,0,0,0,0,1
390,0,0,0,0,0,0,0,0,0,0,0,0,1


In [15]:
# Append the year_* indicator columns side by side with the existing columns.
# NOTE: this rebinds `cars`, so re-running the cell adds the columns again.
cars = pd.concat(objs=[cars, dummy_df_year], axis='columns')

In [16]:
# Confirm the year_* indicator columns were appended as well.
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,cyl_3,cyl_4,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# The raw categorical columns are now represented by their indicator columns,
# so remove the originals.
cars = cars.drop(['cylinders', 'year'], axis='columns')

In [18]:
# Verify that 'cylinders' and 'year' are gone and only their indicator columns remain.
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Randomization of the DataFrame

In [20]:
# Shuffle the rows with a fixed seed so the train/test split, and every result
# derived from it, is reproducible under Restart & Run All.
rng = np.random.default_rng(42)

# Permute row *positions* (0..len-1) rather than index labels: .iloc is
# positional, so this stays correct even if `cars` does not have a default
# RangeIndex.
shuffled_rows = rng.permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]

In [21]:
# Row position where the training portion ends: 70% of the rows, truncated to an int.
n_rows = len(shuffled_cars)
train_index = int(0.7 * n_rows)
print(train_index)

274


In [22]:
# Split at train_index instead of a hard-coded literal, so the 70/30 split
# stays correct if the number of rows in the dataset changes.
train_df = shuffled_cars.iloc[:train_index]
test_df = shuffled_cars.iloc[train_index:]

# Train logistic regression models for the multi-class classification problem

In [24]:
from sklearn.linear_model import LogisticRegression

In [23]:
# Distinct origin labels in the shuffled data (order of first appearance).
unique_origins = pd.unique(shuffled_cars['origin'])

In [25]:
# Put the labels in ascending order so models and probability columns follow
# a stable order.
unique_origins = np.sort(unique_origins)

In [26]:
# Display the sorted origin labels; one binary model is trained per label.
unique_origins

array([1, 2, 3])

In [28]:
# Fitted classifiers, keyed by origin label (filled in by the training loop).
models = dict()

# Predictors are the one-hot indicator columns only: every column whose name
# begins with 'cyl' or 'year'.
features = [col for col in train_df.columns if col.startswith(('cyl', 'year'))]

In [29]:
# One-vs-rest training: fit one binary logistic regression per origin label.
# The feature matrix is the same for every model, so it is selected once.
X_train = train_df[features]

for origin in unique_origins:
    # Binary target: True for rows of this origin, False for all others.
    Y_train = train_df['origin'].eq(origin)

    model = LogisticRegression(solver='liblinear')
    # fit() returns the estimator itself, so the fitted model can be stored directly.
    models[origin] = model.fit(X_train, Y_train)

# Testing models

In [30]:
# Show the fitted models, keyed by origin label.
models

{1: LogisticRegression(solver='liblinear'),
 2: LogisticRegression(solver='liblinear'),
 3: LogisticRegression(solver='liblinear')}

In [32]:
# Same feature columns as used for training, taken from the held-out rows.
X_test = test_df[features]

# One column per origin label. Each column holds that label's model output from
# predict_proba, second column ([:, 1]): the probability of the True class,
# since scikit-learn orders the columns by classes_ ([False, True]).
# The frame gets a fresh 0..n-1 index, not test_df's index.
testing_probs = pd.DataFrame(
    {origin: models[origin].predict_proba(X_test)[:, 1] for origin in unique_origins}
)

testing_probs

Unnamed: 0,1,2,3
0,0.976688,0.013796,0.035259
1,0.335963,0.268576,0.383134
2,0.875219,0.102363,0.042423
3,0.335963,0.268576,0.383134
4,0.313735,0.339416,0.336997
...,...,...,...
113,0.220181,0.403051,0.383843
114,0.553527,0.153038,0.305097
115,0.411032,0.231102,0.346885
116,0.981335,0.019195,0.020544


# Choose origin

In [33]:
# For each test row, pick the origin label whose model gave the highest probability.
predicted_origins = testing_probs.idxmax(axis='columns')

In [34]:
# Print the first predictions followed by the probabilities they were chosen
# from, so each pick can be checked against its row maximum.
print(predicted_origins.head(), testing_probs.head(), sep='\n')

0    1
1    3
2    1
3    3
4    2
dtype: int64
          1         2         3
0  0.976688  0.013796  0.035259
1  0.335963  0.268576  0.383134
2  0.875219  0.102363  0.042423
3  0.335963  0.268576  0.383134
4  0.313735  0.339416  0.336997
