# Data Preprocessing

## Importing the libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

## Importing the dataset

In [None]:
df = pd.read_csv('dataset.csv')

In [None]:
# Features: every column except the first and the last; target: the last column.
X = df.iloc[:, 1:-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [None]:
print(X)

[['verb' 7 4 ... 0 0 2]
 ['noun' 2 1 ... 0 580 11]
 ['adjective' 4 2 ... 0 0 11]
 ...
 ['noun' 6 4 ... 0 0 4]
 ['number' 9 0 ... 0 0 1]
 ['noun' 6 2 ... 0 0 3]]


In [None]:
print(y)

['b2' 'a2' 'a2' ... 'b1' 'a2' 'b2']


## Encoding


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical column at index 0 and pass every other column
# through unchanged. sparse_threshold=0 forces a dense result: calling
# np.array() on a SciPy sparse matrix would wrap it in a 0-d object array
# instead of producing a 2-D feature matrix.
# NOTE: this cell rebinds X, so re-running it without first re-running the cell
# that builds X would try to encode already-encoded data.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map the string level labels to integer class ids. `le` keeps the fitted
# mapping, so ids can be decoded again with le.inverse_transform if needed.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
print(X)

[[0.0 0.0 0.0 ... 0 0 2]
 [0.0 0.0 0.0 ... 0 580 11]
 [1.0 0.0 0.0 ... 0 0 11]
 ...
 [0.0 0.0 0.0 ... 0 0 4]
 [0.0 0.0 0.0 ... 0 0 1]
 [0.0 0.0 0.0 ... 0 0 3]]


In [None]:
print(y)

[3 1 1 ... 2 1 3]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
print(X_train)

[[0.0 0.0 0.0 ... 0 464 4]
 [0.0 0.0 0.0 ... 0 0 3]
 [0.0 0.0 0.0 ... 0 0 4]
 ...
 [0.0 0.0 0.0 ... 0 323 4]
 [1.0 0.0 0.0 ... 0 0 20]
 [0.0 0.0 0.0 ... 0 0 2]]


In [None]:
print(X_test)

[[0.0 0.0 0.0 ... 0 464 71]
 [0.0 0.0 0.0 ... 0 356 4]
 [0.0 0.0 0.0 ... 0 0 5]
 ...
 [0.0 0.0 0.0 ... 0 0 4]
 [0.0 0.0 0.0 ... 0 0 3]
 [0.0 0.0 0.0 ... 0 0 5]]


In [None]:
print(y_train)

[3 2 2 ... 1 0 0]


In [None]:
print(y_test)

[3 4 2 1 4 4 3 1 4 0 4 4 2 3 4 4 3 4 2 0 4 0 1 3 0 3 1 4 1 0 2 3 3 0 4 4 4
 1 3 4 0 3 0 1 0 3 1 4 3 3 1 1 0 3 3 4 3 0 2 4 0 3 1 4 2 4 0 3 1 1 3 4 2 0
 3 2 4 3 4 4 0 2 3 4 2 4 3 1 1 0 2 3 3 4 4 0 0 0 0 4 4 4 4 3 0 0 4 1 1 0 4
 3 0 4 0 3 2 1 1 0 4 4 1 1 2 4 0 4 3 2 2 3 4 1 3 2 0 2 4 1 4 4 3 3 3 1 0 3
 4 0 0 1 3 1 4 4 1 2 1 4 4 0 0 3 3 3 0 4 4 1 3 0 3 1 3 4 2 1 3 1 3 3 3 3 3
 2 3 1 2 0 3 1 3 0 2 3 4 0 3 2 2 1 3 4 3 0 0 4 3 2 0 0 3 2 4 1 0 4 0 4 4 4
 3 3 3 3 4 1 1 3 2 3 2 0 3 0 3 0 1 4 3 1 0 0 3 3 4 1 0 4 3 1 4 3 2 1 0 4 1
 0 1 0 2 0 3 3 0 2 3 1 0 3 0 0 3 0 3 3 0 4 1 3 3 0 3 3 1 3 4 4 0 4 3 2 4 4
 4 3 3 2 3 0 2 4 2 3 3 0 3 1 4 4 2 3 4 4 1 3 0 4 4 2 2 4 1 3 1 1 0 4 3 1 3
 0 2 3 3 3 2 1 4 2 2 0 4 0 1 2 1 4 2 2 1 4 3 4 0 4 3 0 3 3 4 3 4 4 4 3 1 1
 4 2 3 1 3 1 3 3 2 4 3 4 0 4 2 4 2 2 0 1 1 4 4 0 4 4 1 3 2 4 0 0 3 3 3 2 0
 1 4 1 0 1 3 3 3 0 3 1 3 4 4 3 4 1 3 2 3 0 1 4 3 3 4 2 3 2 2 0 0 4 4 3 4 1
 4 1 4 4 2 3 2 1 0 1 0 3 4 1 0 1 0 4 4 4 2 4 4 2 4 4 3 4 1 2 1 4 0 1 0 4 1
 4 4 3 2 4 4 2 0 3 0 0 3 

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize only the columns from index 9 onward; the columns before that
# (the encoded features) are left as they are. The scaler's statistics are
# learned from the training split only and then reused for the test split.
# NOTE: the assignments are in place, so re-running this cell rescales data
# that has already been scaled.
sc = StandardScaler()
sc.fit(X_train[:, 9:])
X_train[:, 9:] = sc.transform(X_train[:, 9:])
X_test[:, 9:] = sc.transform(X_test[:, 9:])

In [None]:
print(X_train)

[[0.0 0.0 0.0 ... -0.25824238955886414 0.15301457120237313
  -0.4315313831333375]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.47932395691964746]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.4315313831333375]
 ...
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.03169179521574734
  -0.4315313831333375]
 [1.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  0.333149797447622]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.5271165307059574]]


In [None]:
print(X_test)

[[0.0 0.0 0.0 ... -0.25824238955886414 0.15301457120237313
  2.7705710605494307]
 [0.0 0.0 0.0 ... -0.25824238955886414 0.011537354371472344
  -0.4315313831333375]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.3837388093470275]
 ...
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.4315313831333375]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.47932395691964746]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.3837388093470275]]


# XGBoost

## Training XGBoost on the Training set

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier created with default hyperparameters (no arguments are
# passed) and fitted on the training split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
# Predict the test split and print the predictions (left column) side by side
# with the true labels (right column).
y_pred = xgb.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[3 3]
 [4 4]
 [3 2]
 ...
 [2 2]
 [4 4]
 [1 0]]


## Evaluating the Model's Performance

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.73      0.71       166
           1       0.44      0.42      0.43       160
           2       0.47      0.38      0.42       145
           3       0.55      0.63      0.58       267
           4       0.74      0.70      0.72       254

    accuracy                           0.59       992
   macro avg       0.58      0.57      0.57       992
weighted avg       0.59      0.59      0.59       992



In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.592741935483871

In [None]:
# adjacent level accuracy: share of test samples whose prediction is exact or
# off by exactly one level (main diagonal plus the two neighbouring diagonals
# of the confusion matrix)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
n_samples = cm.sum()
adjacent_level_accuracy = sum(cm.trace(offset=k) for k in (0, 1, -1)) / n_samples
print(adjacent_level_accuracy)

0.8860887096774194


In [None]:
# Mean absolute difference between the actual and the predicted class ids
# (i.e. how many levels off the prediction is on average).
np.abs(y_test - y_pred).mean()

0.5473790322580645

In [None]:
pd.crosstab(y_test, y_pred)

col_0,0,1,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,121,30,6,7,2
1,37,67,21,30,5
2,12,31,55,36,11
3,5,23,28,167,44
4,1,3,8,64,178


# Classifying 14,000 new words

In [None]:
def classify_word(word):
  """Predict the level label for a single word's feature row.

  Parameters
  ----------
  word : array-like of shape (1, n_features)
    One feature row wrapped in an outer sequence, e.g. ``[[0, 0, ..., 1]]``.
    The columns from index 9 onward are scaled with the fitted scaler ``sc``;
    the columns before that are passed to the model unchanged. The input is
    copied, never modified.

  Returns
  -------
  str
    One of 'A1', 'A2', 'B1', 'B2', 'C1'.

  Raises
  ------
  ValueError
    If the model does not return exactly one prediction, or returns a class
    id outside 0-4.
  """
  levels = ('A1', 'A2', 'B1', 'B2', 'C1')  # position == class id returned by the model
  # Force a float array: with an integer dtype (all-integer input row) the
  # in-place assignment below would silently truncate the scaled values.
  word = np.array(word, dtype=float)
  word[:, 9:] = sc.transform(word[:, 9:])  # Feature scaling (excluding the encoded features)
  pred = np.asarray(xgb.predict(word)).ravel()
  if pred.size != 1:
    raise ValueError('classify_word expects exactly one row, got %d predictions' % pred.size)
  class_id = int(pred[0])
  if not 0 <= class_id < len(levels):
    raise ValueError('unexpected class id from the model: %r' % class_id)
  return levels[class_id]

In [None]:
df = pd.read_csv('unclassified_words.csv')

In [None]:
# Feature matrix for the unlabelled words.
# NOTE: this rebinds X, which earlier in the notebook held the training features.
# NOTE(review): classify_word scales columns 9 onward, so these rows are
# presumably already one-hot encoded like the example rows below; verify
# against the CSV.
X = df.iloc[:, 2:].values # the first two columns are the word and the definition

The predicted levels for new words can be found in the file "predicted_levels.txt" under Files after running the code below.

In [None]:
# Create (or overwrite) the output file and write one predicted level per line,
# in the same order as the rows of X. The context manager closes the file even
# if a prediction raises part-way through.
with open("predicted_levels.txt", "w") as file1:
  for row in X:
    level = classify_word([row.tolist()])
    file1.write(level + "\n")

Example words:

In [None]:
# pizza (noun) = A1
word1 = [[0, 0, 0, 0, 1, 0, 0, 0, 0, 1,	1,	1,	1,	0,	14763787,	1703,	33.39215686,	1605,	42.0908,	1172,	0.1303,	3791,	122.3764,	3455,	1527,	43.3605,	1206,	0.1194,	5,	941.7395833,	565.35915,	1954,	9540,	-7586,	5,	9,	2,	1,	0,	0,	0,	0,	0,	1]]
print('pizza: ' + classify_word(word1))

pizza: A1


In [None]:
# adopted (adjective) = B2
word2 = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 2,	1,	0,	0,	1,	22972983,	69,	1.352941176,	853,	22.3698,	795,	0.0884,	233,	7.5214,	232,	1070,	30.3836,	1056,	0.1045,	2.39,	941.8314176,	467.5664354,	1780,	702,	1078,	7,	10,	3,	1,	0, 0,	0,	0,	0,	4]]	
print('adopted: ' + classify_word(word2))

adopted: B2


In [None]:
# abnormal (adjective) = C1
word3 = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 3,	1,	0,	0,	2,	3963331,	147,	2.882352941,	138,	3.619,	132,	0.0147,	26,	0.8393,	26,	87,	2.4704,	84,	0.0083,	2.14,	1044.260504,	577.143723,	9294,	1938,	7356,	8,	12,	3,	3,	1,	2,	2,	24,	1431,	14]]
print('abnormal: ' + classify_word(word3))

abnormal: C1


# Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the classifier on the training split; report the
# mean and the spread of the per-fold accuracies as percentages.
accuracies = cross_val_score(xgb, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(np.mean(accuracies) * 100))
print("Standard Deviation: {:.2f} %".format(np.std(accuracies) * 100))

Accuracy: 54.48 %
Standard Deviation: 1.84 %
