# Классификация

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

**Теперь мы будем решать задачу классификации и предсказывать классификацию страны по доходам, целевая переменная у нас - Classification, её мы можем ранжировать**

In [131]:
df = pd.read_csv('df_vib.csv')
df.head(3)

Unnamed: 0,YEAR,DISORDER_TYPE,EVENT_TYPE,ACTOR1,INTER1,ACTOR2,INTER2,INTERACTION,CIVILIAN_TARGETING,COUNTRY,LOCATION,LATITUDE,LONGITUDE,NOTES,FATALITIES,index,classification,bool,violence_ind
0,2022,Demonstrations,Protests,Protesters (Italy),6,,0,60,,Italy,Cagliari,39.217,9.113,"On 15 March 2022, striking haulers from across...",0,0.895,High income,0,1.0
1,2020,Demonstrations,Protests,Protesters (United States),6,,0,60,,United States,Chicago,41.85,-87.65,"On 5 October 2020, student-athletes staged a p...",0,0.92,High income,0,1.0
2,1999,Strategic developments,Strategic developments,RUF: Revolutionary United Front,2,Civilians (Sierra Leone),7,27,,Sierra Leone,Bombalie Bana,8.973,-11.89,Looting,0,0.301,Low income,0,14.33


Сначала выкинем все лишние признаки

In [32]:
df_class_model = df.fillna(0)
df_class_model = df_class_model.drop(['ACTOR1', 'ACTOR2', 'INTER1', 'INTER2', 'COUNTRY', 'LOCATION', 'NOTES','CIVILIAN_TARGETING', 'LATITUDE', 'LONGITUDE', 'index','DISORDER_TYPE'], axis = 1)
df_class_model.head(2)

Unnamed: 0,YEAR,EVENT_TYPE,INTERACTION,FATALITIES,classification,bool,violence_ind
0,2022,Protests,60,0,High income,0,1.0
1,2020,Protests,60,0,High income,0,1.0


А также закодируем категориальные признаки:

In [33]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first')
encoder_df = pd.DataFrame(encoder.fit_transform(df_class_model[['EVENT_TYPE']]).toarray())
df_class_model = df_class_model.join(encoder_df).drop(['EVENT_TYPE'],axis = 1)
df_class_model = df_class_model.rename(columns={0:"Protests", 2: "Strategic developments", 1: 'Battles', 4: 'Riots', 2:'Violence against civilians', 3:'Explosions/Remote violence'})
df_class_model.head(2)

Unnamed: 0,YEAR,INTERACTION,FATALITIES,classification,bool,violence_ind,Protests,Battles,Violence against civilians,Explosions/Remote violence,Riots
0,2022,60,0,High income,0,1.0,0.0,1.0,0.0,0.0,0.0
1,2020,60,0,High income,0,1.0,0.0,1.0,0.0,0.0,0.0


In [38]:
tf = {"High income": 4, "Upper middle income": 3, 'Lower middle income':2, 'Low income':1}
df_class_model["classification"] = df_class_model["classification"].replace(tf)

In [39]:
df_class_model.head(3)

Unnamed: 0,YEAR,INTERACTION,FATALITIES,classification,bool,violence_ind,Protests,Battles,Violence against civilians,Explosions/Remote violence,Riots
0,2022,60,0,4,0,1.0,0.0,1.0,0.0,0.0,0.0
1,2020,60,0,4,0,1.0,0.0,1.0,0.0,0.0,0.0
2,1999,27,0,1,0,14.33,0.0,0.0,0.0,1.0,0.0


**Начнем опять с базы, KNN!**

In [40]:
from sklearn.model_selection import train_test_split
np.random.seed(321)
x_train, x_test = train_test_split(df_class_model, test_size = 0.2)
y_train, y_test = x_train['classification'], x_test['classification']
x_train, x_test= np.array(x_train.drop('classification', axis=1)), np.array(x_test.drop('classification', axis=1))

In [57]:
# И конечно для кнн нужен гридсерч
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=1, weights='uniform', algorithm='ball_tree', leaf_size=30, p=2,metric_params=None, n_jobs=None)

param_grid = {
    'n_neighbors': np.arange(1, 500, 10),
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
     }
grid_search = GridSearchCV(model, param_grid, cv=2, scoring = 'f1_macro')
grid_search.fit(x_train, y_train)
grid_search.best_estimator_

In [78]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=61, algorithm='brute',  weights='distance')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred_proba =  model.predict_proba(x_test)


In [82]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
print(f'f1: {f1_score(y_test, y_pred, average = "macro")} ' )
print(f'roc_auc: {roc_auc_score(y_test, y_pred_proba, multi_class= "ovo")}')

f1: 0.5363043093551687 
roc_auc: 0.7814529447922259


F-мера получилась не очень, а вот рок-аук порадовал, посмотрим, что выдадут модели по умнее

**Теперь попробуем обучить логистическую регрессию**

In [110]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='multinomial', C = 10, max_iter=5000, penalty='l2', solver = 'lbfgs')
param_grid = {
    'C': np.arange(1, 200, 10),
    'multi_class': ['ovr', 'multinomial']
    }
grid_search = GridSearchCV(model, param_grid, cv=2, scoring = 'f1_macro')
grid_search.fit(x_train, y_train)
grid_search.best_estimator_

In [141]:
model = LogisticRegression(multi_class='multinomial', C = 71, max_iter=5000, penalty='l2', solver = 'lbfgs')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred_proba =  model.predict_proba(x_test)
print(f'f1: {f1_score(y_test, y_pred, average = "macro")} ' )
print(f'roc_auc: {roc_auc_score(y_test, y_pred_proba, multi_class= "ovo")}')

f1: 0.27444744354224854 
roc_auc: 0.7118141780044446


Как видно, логистическая регрессия справилась куда хуже КНН, особенно по f-1 мере

**А сейчас обучим случайный лес**

In [139]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=1, max_depth=1, min_samples_split=1, min_samples_leaf=1, max_features=1, random_state=321)
param_grid = {
    'n_estimators': [10, 20, 30, 55, 58],
    'max_depth' : [5, 10, 15],
    'min_samples_split': [1 , 2 , 10 , 20],
    'min_samples_leaf': [5, 10],
    'max_features': [10, 20, 30]
}
grid_search = GridSearchCV(model, param_grid, cv=2, scoring = 'f1_macro')

grid_search.fit(x_train, y_train)

grid_search.best_estimator_

In [138]:
model = RandomForestClassifier(max_depth=10, max_features=10, min_samples_leaf=5,n_estimators=55 )
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred_proba =  model.predict_proba(x_test)
print(f'f1: {f1_score(y_test, y_pred, average = "macro")}')
print(f'roc_auc: {roc_auc_score(y_test, y_pred_proba, multi_class= "ovo")}')

f1: 0.5705834512908676
roc_auc: 0.8112862959837545


Как видно, случайный лес смог немного перегнать KNN, что конечно же радует

**Можно сказать что задача классификации по выбранным признакам далась нашим моделям уже сложнее, далее мы попробуем поработать с текстом**