In [1]:
import re
import string
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from sklearn.utils import resample

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

#from comet_ml import Experiment

# Setting global constants to ensure notebook results are reproducible

RANDOM_STATE = 42


import warnings
warnings.filterwarnings('ignore')



In [2]:
# Load files
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')
samplesubmission = pd.read_csv('sample_submission.csv')

# Preview train dataset
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [3]:
# Preview test dataset
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [4]:
# Preview sample submission file
samplesubmission.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [5]:
import string
def remove_punc(data, col):
    """
        This function takes in a dataframe and a column, uses python string package to identify and remove all
        punctions in the column. It returns a new dataframe
    """
    def operation(post):
        return ''.join([l for l in post if l not in string.punctuation])
    
    df = data.copy()
    #df[col] = df[col].str.lower()
    
    df['text_no_punc'] = df[col].apply(operation)
    
    return df

In [6]:
# Create a new dataframe with text column void of punctuation
new_train = remove_punc(train, 'text')
new_train.head()

Unnamed: 0,lang_id,text,text_no_punc
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [7]:
def word_converter(data, col):
    """
        This function takes in a dataframe and col, converts all capitalized words in the column to lowercase,
        and returns a new dataframe.
    """
    df = data.copy()
    df['text_lower'] = df[col].str.lower()
    return df

In [8]:
new_train = word_converter(new_train, 'text_no_punc')
new_train.head()

Unnamed: 0,lang_id,text,text_no_punc,text_lower
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [9]:
# Drop every other columns except lang_id and text_lower columns
train_reduced = new_train[['lang_id', 'text_lower']]
train_reduced.head()

Unnamed: 0,lang_id,text_lower
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [10]:
le = LabelEncoder()
train_reduced["lang_id"] = le.fit_transform(train_reduced["lang_id"])
train_reduced.head()

Unnamed: 0,lang_id,text_lower
0,9,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,9,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,1,the province of kwazulunatal department of tra...
3,3,o netefatša gore o ba file dilo ka moka tše le...
4,8,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [11]:
#Extract dependant for data set
y = train_reduced["lang_id"]

In [12]:
cv = TfidfVectorizer(stop_words='english', min_df=2, max_df= 0.9, ngram_range=(1, 2))
X_data = cv.fit_transform(train_reduced['text_lower'].values.astype(str))
X_data.shape

(33000, 195932)

In [13]:
model_df = pd.DataFrame(data=X_data.toarray(),columns = cv.get_feature_names())
#model_df.head()

MemoryError: Unable to allocate 48.2 GiB for an array with shape (33000, 195932) and data type float64

Logistic Regression Model

In [None]:
logreg_model = LogisticRegression(multi_class='ovr')

In [None]:
logreg_model.fit(model_df, y)

Preprocess test data

In [None]:
# Create a new dataframe with text void of punctuation marks
new_test = remove_punc(test, 'text')
new_test.head()

In [None]:
new_test = word_converter(new_test, 'text_no_punc')
new_test.head()

In [None]:
test_reduced = new_test[['index','text_lower']]
test_reduced.head()

In [None]:
# Fit and Transform the text_lower column fit_transform method
X_count_test = cv.fit_transform(test_reduced['text_lower'].values.astype(str))
X_count_test.shape

In [None]:
test_df = pd.DataFrame(data=X_count_test.toarray(),columns = cv.get_feature_names())
#test_df.head()

In [None]:
y_pred_test = logreg_model.predict(test_df)

In [None]:
# Creating a datframe with the lang_id and predicted y columns
df1 = pd.DataFrame({
    'index': test['index'],
    'lang_id': y_pred_test
})

df1.head()

In [None]:
#Converting the lang_id from numeric to text
df1['lang_id'] = le.inverse_transform(df1['lang_id'])

df1.head()

In [None]:
# Downloading the sample csv for logreg_model
df1.to_csv('file_lr.csv',index=False)

AdaBoost Model

In [None]:
ab_model = AdaBoostClassifier()

In [None]:
ab_model.fit(model_df, y)

In [None]:
y_pred_ab = ab_model.predict(test_df)

In [None]:
# Creating a datframe with the lang_id and predicted y columns
df2 = pd.DataFrame({
    'index': test['index'],
    'lang_id': y_pred_ab
})

df2.head()

In [None]:
#Converting the lang_id from numeric to text
df2['lang_id'] = le.inverse_transform(df2['lang_id'])

df2.head()

In [None]:
# Downloading the sample csv for logreg_model
df2.to_csv('file_ab.csv',index=False)

Hyperparameter tuning for the logreg and AdaBoost models

Logistic Regression

In [None]:
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(model_df, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
#Instantiating the best lr model
best_lr_model = LogisticRegression(#best_params)

In [None]:
best_lr_model.fit(model_df, y)

In [None]:
#predicting the best lr model
y_predlr_best = best_lr_model.predict(test_df)

In [None]:
# Creating a datframe with the lang_id and predicted y columns
df3 = pd.DataFrame({
    'index': test['index'],
    'lang_id': y_predlr_best
})

df3.head()

In [None]:
#Converting the lang_id from numeric to text
df3['lang_id'] = le.inverse_transform(df3['lang_id'])

df3.head()

In [None]:
# Downloading the sample csv for best logreg_model
df3.to_csv('file_blr.csv',index=False)

AdaBoost Classifier

In [None]:
# define the model with default hyperparameters
model = AdaBoostClassifier()
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
grid_result = grid_search.fit(model_df, y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
#Instantiating the best lr model
best_ab_model = AdaBoostClassifier(#best_params)

In [None]:
best_ab_model.fit(model_df, y)

In [None]:
#predicting the best AdaBoost model
y_predab_best = best_ab_model.predict(test_df)

In [None]:
# Creating a datframe with the lang_id and predicted y columns
df4 = pd.DataFrame({
    'index': test['index'],
    'lang_id': y_predab_best
})

df4.head()

In [None]:
#Converting the lang_id from numeric to text
df4['lang_id'] = le.inverse_transform(df4['lang_id'])

df4.head()

In [None]:
# Downloading the sample csv for best logreg_model
df4.to_csv('file_bab.csv',index=False)