# Data Preprocessing

## Importing the libraries

In [46]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

## Importing the dataset

In [47]:
df = pd.read_csv('main_dataset.csv')

In [48]:
X = df.iloc[:, 1:-1].values
y = df.iloc[:, -1].values

In [49]:
print(X)

[['verb' 7 4 ... 0 0 0]
 ['noun' 2 1 ... 3 0 580]
 ['adjective' 4 2 ... 0 0 0]
 ...
 ['noun' 6 4 ... 0 0 0]
 ['number' 9 0 ... 0 0 0]
 ['noun' 6 2 ... 0 0 0]]


In [50]:
print(y)

['b2' 'a2' 'a2' ... 'b1' 'a2' 'b2']


## Encoding


In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [52]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [53]:
print(X)

[[0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 3 0 580]
 [1.0 0.0 0.0 ... 0 0 0]
 ...
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]]


In [54]:
print(y)

[3 1 1 ... 2 1 3]


## Splitting the dataset into the Training set and Test set

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [56]:
print(X_train)

[[0.0 0.0 0.0 ... 3 0 464]
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]
 ...
 [0.0 0.0 0.0 ... 4 0 323]
 [1.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]]


In [57]:
print(X_test)

[[0.0 0.0 0.0 ... 3 0 464]
 [0.0 0.0 0.0 ... 3 0 356]
 [0.0 0.0 0.0 ... 0 0 0]
 ...
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 0.0 ... 0 0 0]]


In [58]:
print(y_train)

[3 2 2 ... 1 0 0]


In [59]:
print(y_test)

[3 4 2 1 4 4 3 1 4 0 4 4 2 3 4 4 3 4 2 0 4 0 1 3 0 3 1 4 1 0 2 3 3 0 4 4 4
 1 3 4 0 3 0 1 0 3 1 4 3 3 1 1 0 3 3 4 3 0 2 4 0 3 1 4 2 4 0 3 1 1 3 4 2 0
 3 2 4 3 4 4 0 2 3 4 2 4 3 1 1 0 2 3 3 4 4 0 0 0 0 4 4 4 4 3 0 0 4 1 1 0 4
 3 0 4 0 3 2 1 1 0 4 4 1 1 2 4 0 4 3 2 2 3 4 1 3 2 0 2 4 1 4 4 3 3 3 1 0 3
 4 0 0 1 3 1 4 4 1 2 1 4 4 0 0 3 3 3 0 4 4 1 3 0 3 1 3 4 2 1 3 1 3 3 3 3 3
 2 3 1 2 0 3 1 3 0 2 3 4 0 3 2 2 1 3 4 3 0 0 4 3 2 0 0 3 2 4 1 0 4 0 4 4 4
 3 3 3 3 4 1 1 3 2 3 2 0 3 0 3 0 1 4 3 1 0 0 3 3 4 1 0 4 3 1 4 3 2 1 0 4 1
 0 1 0 2 0 3 3 0 2 3 1 0 3 0 0 3 0 3 3 0 4 1 3 3 0 3 3 1 3 4 4 0 4 3 2 4 4
 4 3 3 2 3 0 2 4 2 3 3 0 3 1 4 4 2 3 4 4 1 3 0 4 4 2 2 4 1 3 1 1 0 4 3 1 3
 0 2 3 3 3 2 1 4 2 2 0 4 0 1 2 1 4 2 2 1 4 3 4 0 4 3 0 3 3 4 3 4 4 4 3 1 1
 4 2 3 1 3 1 3 3 2 4 3 4 0 4 2 4 2 2 0 1 1 4 4 0 4 4 1 3 2 4 0 0 3 3 3 2 0
 1 4 1 0 1 3 3 3 0 3 1 3 4 4 3 4 1 3 2 3 0 1 4 3 3 4 2 3 2 2 0 0 4 4 3 4 1
 4 1 4 4 2 3 2 1 0 1 0 3 4 1 0 1 0 4 4 4 2 4 4 2 4 4 3 4 1 2 1 4 0 1 0 4 1
 4 4 3 2 4 4 2 0 3 0 0 3 

## Feature Scaling

In [60]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 9:] = sc.fit_transform(X_train[:, 9:])
X_test[:, 9:] = sc.transform(X_test[:, 9:])

In [61]:
print(X_train)

[[0.0 0.0 0.0 ... 1.5232996743901914 -0.25824238955886414
  0.15301457120237313]
 [0.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]
 [0.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]
 ...
 [0.0 0.0 0.0 ... 2.2181780923558048 -0.25824238955886414
  -0.03169179521574734]
 [1.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]
 [0.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]]


In [62]:
print(X_test)

[[0.0 0.0 0.0 ... 1.5232996743901914 -0.25824238955886414
  0.15301457120237313]
 [0.0 0.0 0.0 ... 1.5232996743901914 -0.25824238955886414
  0.011537354371472344]
 [0.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]
 ...
 [0.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]
 [0.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]
 [0.0 0.0 0.0 ... -0.5613355795066479 -0.25824238955886414
  -0.45481347147853396]]


In [63]:
X_train.shape

(3965, 47)

In [64]:
X_test.shape

(992, 47)

In [65]:
y_train.shape

(3965,)

In [66]:
y_test.shape

(992,)

# XGBoost

## Training XGBoost on the Training set

In [67]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

## Applying Grid Search to find the best parameters

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
parameters = {'colsample_bytree': [1.0],
              'eta': [0.01],
              'eval_metric': ['mae'],
              'max_depth': [10],
              'min_child_weight': [6],
              'objective': ['reg:linear'],
              'subsample': [0.8]}

grid_search = GridSearchCV(estimator = xgb,
                           param_grid = parameters,
                           n_jobs=5, 
                           cv=10, 
                           scoring='roc_auc',
                           verbose=2, refit=True)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 10 folds for each of 1 candidates, totalling 10 fits




Best Accuracy: nan %
Best Parameters: {'colsample_bytree': 1.0, 'eta': 0.01, 'eval_metric': 'mae', 'max_depth': 10, 'min_child_weight': 6, 'objective': 'reg:linear', 'subsample': 0.8}


In [None]:
xgb = XGBClassifier(colsample_bytree=best_parameters['colsample_bytree'], eta=best_parameters['eta'], eval_metric=best_parameters['eval_metric'], max_depth=best_parameters['max_depth'], min_child_weight=best_parameters['min_child_weight'], objective=best_parameters['objective'], subsample=best_parameters['subsample'])
xgb.fit(X_train, y_train)

## Predicting the Test set results

In [68]:
y_pred= xgb.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[3 3]
 [4 4]
 [3 2]
 ...
 [2 2]
 [4 4]
 [1 0]]


## Evaluating the Model's Performance

In [69]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.70      0.75      0.73       166
           1       0.46      0.42      0.44       160
           2       0.46      0.37      0.41       145
           3       0.53      0.59      0.56       267
           4       0.70      0.71      0.71       254

    accuracy                           0.59       992
   macro avg       0.57      0.57      0.57       992
weighted avg       0.58      0.59      0.58       992



In [70]:
pd.crosstab(y_test, y_pred)

col_0,0,1,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,124,27,7,5,3
1,37,68,24,28,3
2,10,26,53,44,12
3,4,22,26,157,58
4,1,4,6,63,180


## Classifying new words

In [83]:
def classify_new_words(word):
  word = np.array(word)
  word[:, 9:] = sc.transform(word[:, 9:])
  pred = str(xgb.predict(word))
  if pred == '[0]':
    level = 'A1'
  elif pred == '[1]':
    level = 'A2'
  elif pred == '[2]':
    level = 'B1'
  elif pred == '[3]':
    level = 'B2'
  elif pred == '[4]':
    level = 'C1'
  return level

In [84]:
# pizza (noun) = A1
word1 = [[0, 0, 0, 0, 1, 0, 0, 0, 0,	1,	1,	1,	1,	0,	14763787,	1703,	33.3921568627451,	1605,	42.0908,	1172,	0.1303,	3791,	122.3764,	3455,	0.1464,	1527,	43.3605,	1206,	0.1194,	5,	0,	28,	565.359149989908,	-0.739931543512408,	1954,	9540,	-7586,	5,	9,	2,	1,	0,	0,	0,	0,	0,	0]]
print('pizza: ' + classify_new_words(word1))

pizza: A1


In [85]:
# calendar (noun) = A2
word2 = [[0, 0, 0, 0, 1, 0, 0, 0, 0,	4,	0,	3,	3,	0,	101303808,	363,	7.11764705882353,	865,	22.6845,	751,	0.0835,	938,	30.2794,	914,	0.0387,	665,	18.8833,	627,	0.0621,	4.62,	0,	29,	581.597238968368,	-0.684524331538036,	2342,	919,	1423,	8,	10,	3,	1,	0,	0,	0,	0,	0,	0]]
print('calendar: ' + classify_new_words(word2))

calendar: A2


In [86]:
# awake (adjective) = B1
word3 = [[1, 0, 0, 0, 0, 0, 0, 0, 0,	4,	1,	0,	0,	1,	3230337,	1105,	21.6666666666667,	790,	20.7176,	745,	0.0828,	1891,	61.0429,	1856,	0.0786,	166,	4.7137,	163,	0.0161,	3.32,	0,	28,	369.272520405629,	-0.713699907623462,	4143,	6493,	-2350,	5,	9,	2,	2,	1,	0,	1,	0,	309,	0]]
print('awake: ' + classify_new_words(word3))

awake: B1


In [87]:
# adopted (adjective) = B2
word4 = [[1, 0, 0, 0, 0, 0, 0, 0, 0,	2,	1,	0,	0,	1,	22972983,	69,	1.35294117647059,	853,	22.3698,	795,	0.0884,	233,	7.5214,	232,	0.0098,	1070,	30.3836,	1056,	0.1045,	2.39,	0,	28,	467.5664353551,	-0.670148128487854,	1780,	702,	1078,	7,	10,	3,	1,	0,	0,	0,	0,	0,	0]]
print('adopted: ' + classify_new_words(word4))

adopted: B2


In [88]:
# abnormal (adjective) = C1
word5 = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 2, 3963331, 147, 2.88235294117647, 138, 3.619, 132, 0.0147, 26, 0.8393, 26, 0.0011, 87, 2.4704, 84, 0.0083, 2.14, 0, 29, 577.143723000519, -0.515106505596673, 9294, 1938, 7356, 8, 12, 3, 3, 1, 1, 2, 2, 24, 1431]]
print('abnormal: ' + classify_new_words(word5))

abnormal: C1
