In this notebook, the text classification task is performed. First the 3 datasets are imported, then 'text preprocessing' is applied (detailed info in Master Thesis). Once this has been done, all of the five chosen models are trained and tested, after finding the optimal parameters.

Then all these predictions are saved, as they are the input for the meta-learner. These files are used in "GH - Ensemble Model".

In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
import re
from nltk.stem.snowball import SnowballStemmer
from stop_words import get_stop_words
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#### Import 'training dataset(1)', 'training dataset(2)', and 'validation dataset(2)'

In [2]:
train1 = pd.read_excel("strat_train1_data.xlsx")
train2 = pd.read_excel("strat_train2_data.xlsx")
val2 = pd.read_excel("strat_val2_data.xlsx")

In [3]:
def text_preprocess(dataset):
    dataset = dataset.fillna('nan')
    dataset['longdescription'] = dataset['longdescription'].str.replace('nan', '')
    dataset['keywords'] = dataset['keywords'].str.replace('nan', '')
    dataset['text_input'] = dataset['shortdescription'] + ' ' + dataset['longdescription'].fillna('') + ' ' + dataset['namemanufacturer'] + ' ' + dataset['keywords'].fillna('')

    return dataset

In [4]:
#Apply all text preprocessing to the datasets
train1 = text_preprocess(train1)
train2 = text_preprocess(train2)
val2 = text_preprocess(val2)

In [5]:
#Create corpus of the training and validation data

def create_corpus(listname, dataset):
    stop_words1 = get_stop_words('german')

    for i in range(0,len(dataset.text_input)):
        review1 = re.sub('[^a-zA-ZüäöØ°C0-9()ß-]',' ', str(dataset.text_input[i])) ## ADDED ö, Diameter, Degree Celsius
        review1 = review1.lower()
        review1 = review1.split()
        stemmer1 = SnowballStemmer("german")
        review1 = [stemmer1.stem(word) for word in review1 if not word in set(stop_words1)]
        review1 =  ' '.join(review1)
        listname.append(review1)
        
    return listname

In [6]:
corpus_train1 = []
corpus_train2 = []
corpus_val2 = []

corpus_train1 = create_corpus(corpus_train1, train1)
corpus_train2 = create_corpus(corpus_train2, train2)
corpus_val2 = create_corpus(corpus_val2, val2)

In [7]:
cv = CountVectorizer(max_features = 50000)
X_train1 = cv.fit_transform(corpus_train1)

#train2 and val2 only need to be transformed, as they are both subsets of train1
X_train2 = cv.transform(corpus_train2)
X_val2 = cv.transform(corpus_val2)

In [8]:
tfidf = TfidfTransformer()
X_train1 = tfidf.fit_transform(X_train1)

#Here as well, only transformation needed.
X_train2 = tfidf.transform(X_train2)
X_val2 = tfidf.transform(X_val2)
print(X_train1.shape)
print(X_train2.shape)
print(X_val2.shape)

(15678, 35581)
(3922, 35581)
(1308, 35581)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [9]:
Y_train1 = train1.Label
Y_train2 = train2.Label
Y_val2 = val2.Label
print(Y_train1.shape)
print(Y_train2.shape)
print(Y_val2.shape)

(15678,)
(3922,)
(1308,)


## Section 2: GridSearch of models & train/test

Importing the necessary modules

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

The creation of Stratified K-folds has been done in 'Create_TFRecords.ipynb', as the image split was done there as well.

#### KNN model, gridsearch to find optimal parameters first


In [28]:
values_n = range(1,20)
weight_options = ['uniform', 'distance']
param_grid1 = dict(n_neighbors = values_n, weights = weight_options)

sk_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
clf = KNeighborsClassifier()
grid = GridSearchCV(clf, param_grid1, cv=sk_fold, verbose=3, scoring = 'accuracy')
grid.fit(X_train1, Y_train1)
#grid.cv_results_
grid.best_params_

Now use the optimal parameters, found in GridSearch, to fit and predict with the model

In [11]:
#%%timeit
#Optimal params: n_neighbors = 2, weights = 'distance'
knn = KNeighborsClassifier(n_neighbors = 2, weights = 'distance')
knn.fit(X_train1, Y_train1)

knn_predicted_train2 = knn.predict(X_train2)
knn_predicted_val2 = knn.predict(X_val2)

print("Acc train2: ", accuracy_score(Y_train2, knn_predicted_train2))
print("Acc val2: ", accuracy_score(Y_val2, knn_predicted_val2))


Acc train2:  0.9472208057113718
Acc val2:  0.9457186544342507


In [14]:
knn_proba_train2 = knn.predict_proba(X_train2)
knn_proba_val2 = knn.predict_proba(X_val2)

knn_predlabel_train2 = pd.DataFrame(knn_predicted_train2)
knn_predlabel_val2 = pd.DataFrame(knn_predicted_val2)
knn_df_train2 = pd.DataFrame(knn_proba_train2)
knn_df_val2 = pd.DataFrame(knn_proba_val2)

knn_predlabel_train2.to_excel("KNN_labelpredictions_train2.xlsx")
knn_predlabel_val2.to_excel("KNN_labelpredictions_val2.xlsx")
knn_df_train2.to_excel("KNN_Predictions_train2.xlsx")
knn_df_val2.to_excel("KNN_Predictions_val2.xlsx")

#### Use LogReg with OneVsRest 

In [None]:
p_logreg = {
    "estimator__solver": ['liblinear', 'lbfgs', 'newton-cg', 'sag'], ## lbfgs, newton-cg, sag are inferior to LIBLINEAR
    "estimator__multi_class": ["ovr"],
    "estimator__C": [0.001, 0.01, 0.1, 1, 10, 100]
}

sk_logreg = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

clf_logreg = OneVsRestClassifier(LogisticRegression())

grid = GridSearchCV(clf_logreg, p_logreg, cv=sk_logreg)
grid.fit(X_train1, Y_train1)
grid.cv_results_
grid.best_params_

In [13]:
# OPTIMAL PARAMS: C=100, multi_class = 'ovr', solver = 'liblinear'
lr_ovr = OneVsRestClassifier(LogisticRegression(solver='liblinear', multi_class = 'ovr', C=100)) ##CHECK, MODELS ARE THE SAME!
lr_ovr.fit(X_train1, Y_train1)

lr_pred_train2 = lr_ovr.predict(X_train2)
lr_pred_val2 = lr_ovr.predict(X_val2)
print("Acc train2: ", accuracy_score(Y_train2, lr_pred_train2))
print("Acc val2: ", accuracy_score(Y_val2, lr_pred_val2))

Acc train2:  0.9553799082100969
Acc val2:  0.9548929663608563


Predict Probabilities

In [14]:
lr_proba_train2 = lr_ovr.predict_proba(X_train2)
lr_proba_val2 = lr_ovr.predict_proba(X_val2)

lr_df_train2 = pd.DataFrame(lr_proba_train2)
lr_df_val2 = pd.DataFrame(lr_proba_val2)

lr_df_train2.to_excel("LR_Predictions_train2.xlsx")
lr_df_val2.to_excel("LR_Predictions_val2.xlsx")

#### Naïve Bayes
No grid search is applicable to MNB

In [32]:
nb_ovr = OneVsRestClassifier(MultinomialNB())
nb_ovr.fit(X_train1, Y_train1)

nb_pred_train2 = nb_ovr.predict(X_train2)
nb_pred_val2 = nb_ovr.predict(X_val2)

print("Acc train2: ", accuracy_score(Y_train2, nb_pred_train2))
print("Acc:", accuracy_score(Y_val2, nb_pred_val2))

Acc train2:  0.8220295767465579
Acc: 0.8241590214067278


Predict Probabilities

In [94]:
nb_proba_train2 = nb_ovr.predict_proba(X_train2)
nb_proba_val2 = nb_ovr.predict_proba(X_val2)

nb_df_train2 = pd.DataFrame(nb_proba_train2)
nb_df_val2 = pd.DataFrame(nb_proba_val2)

nb_df_train2.to_excel("NB_Predictions_train2.xlsx")
nb_df_val2.to_excel("NB_Predictions_val2.xlsx")

### SVC model
When kernel is set to linear, gamma has no influence.
GridSearch runs for a couple of hours (>15 hours)

In [5]:
p_svc = {
    "estimator__C": [0.001, 0.01, 0.1, 1, 10, 100], ##Eerst stond hier: 1,10,100,1000
    "estimator__kernel": ["linear", "rbf", "poly", "sigmoid"] ## AND RBF POLY SIGMOID
    "estimator__gamma":[0.001, 0.01, 0.1, 1],
}




sk_SVC = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

clf3 = OneVsRestClassifier(SVC())

grid = GridSearchCV(clf3, p_svc, cv=sk_SVC, verbose=3)
grid.fit(X_train1, Y_train1)
grid.cv_results_
grid.best_params_

In [33]:
#Optimal params: C=10, kernel= 'linear', gamma = 1
svc_ovr = OneVsRestClassifier(SVC(C=10 , kernel='linear', gamma=1, probability=False))
svc_ovr.fit(X_train1, Y_train1)

svc_predicted_train2 = svc_ovr.predict(X_train2)
svc_predicted_val2 = svc_ovr.predict(X_val2)

print("Acc train2: ", accuracy_score(Y_train2, svc_predicted_train2))
print("Acc val2:", accuracy_score(Y_val2, svc_predicted_val2))

Acc train2:  0.9576746557878634
Acc val2: 0.9541284403669725


In [96]:
svc_predlabel_train2 = pd.DataFrame(svc_predicted_train2)
svc_predlabel_val2 = pd.DataFrame(svc_predicted_val2)

svc_predlabel_train2.to_excel("SVC_labelpredictions_train2.xlsx")
svc_predlabel_val2.to_excel("SVC_labelpredictions_val2.xlsx")

Predict Probabilities

In [96]:
svc_proba_train2 = svc_ovr.predict_proba(X_train2)
svc_proba_val2 = svc_ovr.predict_proba(X_val2)

In [97]:
svc_df_train2 = pd.DataFrame(svc_proba_train2)
svc_df_val2 = pd.DataFrame(svc_proba_val2)

svc_df_train2.to_excel("SVC_Predictions_train2.xlsx")
svc_df_val2.to_excel("SVC_Predictions_val2.xlsx")


## DecisionTree Classifier
Runs for approx. 30 minutes

In [6]:
parameters = {
    "estimator__criterion": ['gini', 'entropy'],
    "estimator__splitter":['best', 'random'],
}


sk_DT = StratifiedKFold(n_splits=3, shuffle=True, random_state=4)

clf4 = OneVsRestClassifier(DecisionTreeClassifier())

grid4 = GridSearchCV(clf4, parameters, cv=sk_DT, n_jobs=-1, verbose=3)
grid4.fit(X_train1, Y_train1)
grid4.cv_results_
grid4.best_params_

In [34]:
#Optimal params: criterion = 'entropy', splitter = 'random'

dt_ovr = OneVsRestClassifier(DecisionTreeClassifier(criterion = 'entropy', splitter = 'random'))
dt_ovr.fit(X_train1, Y_train1)

dt_pred_train2 = dt_ovr.predict(X_train2)
dt_pred_val2 = dt_ovr.predict(X_val2)

print("Acc train2:", accuracy_score(Y_train2, dt_pred_train2))
print("Acc val2:", accuracy_score(Y_val2, dt_pred_val2))

Acc train2: 0.8908720040795512
Acc val2: 0.8876146788990825


Predict Probabilities

In [8]:
dt_proba_train2 = dt_ovr.predict_proba(X_train2)
dt_proba_val2 = dt_ovr.predict_proba(X_val2)

dt_df_train2 = pd.DataFrame(dt_proba_train2)
dt_df_val2 = pd.DataFrame(dt_proba_val2)

dt_df_train2.to_excel("DT_Predictions_train2.xlsx")
dt_df_val2.to_excel("DT_Predictions_val2.xlsx")

## Statistics calculation
Metrics revelant: Accuracy (F1-Micro) and F1-Macro
Accuracy & F1-Micro are supposed to be equal (multi-class setting), thus check is performed.

Further below, Accuracy (F1-Micro) and F1-Macro are calculated for IRNV2 model


In [20]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

In [18]:
def metrics_calc(pred):
    print("Acc: ", accuracy_score(Y_val2, pred))
    print("F1-micro: ", f1_score(Y_val2, pred, average = 'micro'))
    print("F1-macro: ", f1_score(Y_val2, pred, average = 'macro'))
    

In [7]:
print("KNN:")
metrics_calc(knn_predicted_val2)
print("LR:")
metrics_calc(lr_pred_val2)
print("NB:")
metrics_calc(nb_pred_val2)
print("SVM:")
metrics_calc(svc_predicted_val2)
print("DT:")
metrics_calc(dt_pred_val2)

In [15]:
## CODE COPIED FROM "Create TFRecords ipynb", to map the labels according to dictionary values (needed to make top-5 acc)
excel_labels275 = pd.read_excel('labels_left.xlsx')
#print(len(excel_labels275))
excel_labels275 = excel_labels275.drop_duplicates()
#print(len(excel_labels275))
list_labels275 = list(excel_labels275['labels_left'])

class_names_sorted = [int(x) for x in list_labels275]
class_names_sorted.sort(key=float)
class_names_string = [str(item) for item in class_names_sorted]
class_names_to_ids = dict(zip(class_names_string, range(len(class_names_string))))
#class_names_to_ids

#The mapping has to be done other way around, in this case
ids_to_class_names = {v: k for k, v in class_names_to_ids.items()}


#### Top 1 Accuracy Inception ResNet V2 (Image model) & F1 micro/F1 macro

In [16]:
image_val_input = pd.read_excel('image_input_val2_ensemble.xlsx')

In [17]:
image_val_input['Probabilities'] = image_val_input['Probabilities'].str.replace('[', '')
image_val_input['Probabilities'] = image_val_input['Probabilities'].str.replace(']', '')

In [18]:
#image_probabilities = image_probabilities.values.tolist()
image_val_prob = []
for row1 in image_val_input['Probabilities']:
    probclass1 = row1.split()
    probclass1 = list(map(float, probclass1))
    image_val_prob.append(probclass1)

In [19]:
image_val_prob = np.array(image_val_prob)

In [41]:
def top1and5_acc_image(probability_predictions, ids_to_class_names):
    
    df_proba = pd.DataFrame(probability_predictions)
    prediction_top_proba = []
    for index, row in df_proba.iterrows():
        df_proba = df_proba = df_proba.rename(columns=ids_to_class_names)
        top_predictions = df_proba.sort_values(by=index, axis=1, ascending = False).columns.values[0:5]
        prediction_top_proba.append(top_predictions)

    prediction_top_proba = [list(map(int, row)) for row in prediction_top_proba]
    predictions_top_df = pd.DataFrame({'Y_val2': Y_val2, 'Y_top_proba': prediction_top_proba })
    total_1 = 0
    total_5 = 0
    for index, row in predictions_top_df.iterrows():
        real = predictions_top_df['Y_val2'].loc[index]
        predicted_list = predictions_top_df['Y_top_proba'].loc[index]
        true_pred_1 = 0
        true_pred_5 = 0
        
        for prediction_1 in predicted_list[0:1]:
            if real == prediction_1:
                true_pred_1 = 1
        total_1 += true_pred_1
        
        for prediction_5 in predicted_list[0:5]:
            if real == prediction_5:
                true_pred_5 = 1
        total_5 += true_pred_5
    
    top1_acc = total_1/len(predictions_top_df)
    top5_acc = total_5/len(predictions_top_df)
    print("Top-1 accuracy %.2f"%(top1_acc*100))
    print('Top-5 Accuracy %.2f'%(top5_acc*100))

In [1]:
top1and5_acc_image(image_val_prob, ids_to_class_names)

In [43]:
## Obtain the list of predicted labels from the image dataset, to calculate F1 micro & F1 macro
## The function prints the associated F1 micro & F1 macro scores right away
def f1micromacro(probability_predictions, ids_to_class_names):
    predicted_labels = []
    
    df_proba = pd.DataFrame(probability_predictions)
    prediction_top_proba = []
    for index, row in df_proba.iterrows():
        df_proba = df_proba = df_proba.rename(columns=ids_to_class_names)
        top_predictions = df_proba.sort_values(by=index, axis=1, ascending = False).columns.values[0:5]
        prediction_top_proba.append(top_predictions)

    prediction_top_proba = [list(map(int, row)) for row in prediction_top_proba]
    predictions_top_df = pd.DataFrame({'Y_val2': Y_val2, 'Y_top_proba': prediction_top_proba })
    total_1 = 0
    total_5 = 0
    for index, row in predictions_top_df.iterrows():
        real = predictions_top_df['Y_val2'].loc[index]
        predicted_list = predictions_top_df['Y_top_proba'].loc[index]
        true_pred_1 = 0
        true_pred_5 = 0
        
        for prediction_1 in predicted_list[0:1]:
            predicted_labels.append(prediction_1)
            
            
    print("F1-micro: ", f1_score(Y_val2, predicted_labels, average = 'micro'))
    print("F1-macro: ", f1_score(Y_val2, predicted_labels, average = 'macro'))

In [3]:
f1micromacro(image_val_prob, ids_to_class_names)