In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report, confusion_matrix





In [2]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [3]:
def remove_links(text):
    '''Takes a string and removes web links from it'''
    text = re.sub(r'http\S+', '', text) # remove http links
    text = re.sub(r'bit.ly/\S+', '', text) # rempve bitly links
    return text 

punct = string.punctuation
def preprocess(sent):
    sent = remove_links(sent)
    sent = sent.lower() # lower case
    sent = re.sub('['+punct + ']+', ' ', sent) # strip punctuation
    sent = re.sub('\s+', ' ', sent.strip()) #remove any weird spacing
    sent = re.sub('([0-9]+)', '', sent) # remove numbers
    sent_token_list = [word for word in sent.split(' ')]
    sent = ' '.join(sent_token_list)
    return sent

In [4]:
df = train.copy()
df_test = test.copy()

In [5]:
df['clean_text']=df['text'].apply(lambda x: preprocess(x))
df_test['clean_text']=df_test['text'].apply(lambda x: preprocess(x))

In [6]:
x1 = pd.Series(df['clean_text'])
x2 = pd.Series(df_test['clean_text'])
y = df['lang_id']

In [7]:
X = pd.concat([x1,x2])

In [8]:
X.head()

0    umgaqo siseko wenza amalungiselelo kumaziko ax...
1    i dha iya kuba nobulumko bokubeka umsebenzi na...
2    the province of kwazulu natal department of tr...
3    o netefatša gore o ba file dilo ka moka tše le...
4    khomishini ya ndinganyiso ya mbeu yo ewa maana...
Name: clean_text, dtype: object

In [9]:
le = LabelEncoder()
y_le = le.fit_transform(y)

In [57]:
list(le.classes_)

['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul']

In [55]:
list(le.inverse_transform(y_le))

['xho',
 'xho',
 'eng',
 'nso',
 'ven',
 'nso',
 'tsn',
 'ven',
 'nso',
 'tsn',
 'nbl',
 'ven',
 'zul',
 'ssw',
 'zul',
 'nso',
 'tso',
 'zul',
 'sot',
 'nbl',
 'tsn',
 'tsn',
 'tso',
 'eng',
 'ssw',
 'zul',
 'nbl',
 'ssw',
 'xho',
 'sot',
 'eng',
 'sot',
 'afr',
 'tso',
 'ssw',
 'tsn',
 'zul',
 'eng',
 'tso',
 'tsn',
 'afr',
 'eng',
 'afr',
 'xho',
 'tsn',
 'sot',
 'tso',
 'ssw',
 'nbl',
 'afr',
 'tsn',
 'tso',
 'ssw',
 'afr',
 'tso',
 'sot',
 'zul',
 'nso',
 'nso',
 'xho',
 'afr',
 'nso',
 'afr',
 'tso',
 'tsn',
 'tso',
 'tso',
 'ssw',
 'zul',
 'nbl',
 'tso',
 'nbl',
 'nbl',
 'xho',
 'ssw',
 'ssw',
 'sot',
 'eng',
 'zul',
 'xho',
 'sot',
 'ven',
 'nbl',
 'tsn',
 'tso',
 'sot',
 'eng',
 'tsn',
 'ven',
 'tsn',
 'nso',
 'zul',
 'sot',
 'tsn',
 'nbl',
 'nso',
 'ssw',
 'sot',
 'tsn',
 'nbl',
 'ven',
 'tso',
 'ven',
 'eng',
 'eng',
 'tso',
 'tsn',
 'zul',
 'tsn',
 'zul',
 'afr',
 'afr',
 'sot',
 'eng',
 'nso',
 'ven',
 'nbl',
 'sot',
 'nbl',
 'tsn',
 'afr',
 'zul',
 'ven',
 'ven',
 'sot',


In [134]:
cv = TfidfVectorizer(min_df=2,max_df=0.9,ngram_range=(1, 2))
X_data = cv.fit_transform(X)


In [135]:
X_data1 = X_data[0:33000]

In [136]:
X_data2 = X_data[33000: , :]

In [137]:
X_data2.shape

(5682, 227514)

In [138]:
test.shape

(5682, 2)

In [139]:
X_train,x_test,y_train,y_test = train_test_split(X_data1,y_le,test_size = 0.3,random_state = 42)

#### Logistic Regression

In [140]:
lr = LogisticRegression(multi_class='ovr',n_jobs=1,C=1e5,max_iter=4000)

In [141]:
lr.fit(X_train, y_train)

lr_pred = lr.predict(x_test)

print("Training Accuracy :", lr.score(X_train, y_train))
print("Validation Accuracy :", lr.score(x_test, y_test))

Training Accuracy : 1.0
Validation Accuracy : 0.9973737373737374


In [142]:
from sklearn.metrics import classification_report
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       895
           1       1.00      1.00      1.00       909
           2       0.99      1.00      0.99       879
           3       1.00      1.00      1.00       941
           4       1.00      1.00      1.00       918
           5       1.00      1.00      1.00       908
           6       1.00      1.00      1.00       882
           7       1.00      1.00      1.00       857
           8       1.00      1.00      1.00       936
           9       0.99      1.00      0.99       922
          10       0.99      0.99      0.99       853

    accuracy                           1.00      9900
   macro avg       1.00      1.00      1.00      9900
weighted avg       1.00      1.00      1.00      9900



In [143]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = le.inverse_transform(lr.predict(X_data2))
submission_df.to_csv('lr.csv', index=False)

#### Random Forest

In [20]:
rfc = RandomForestClassifier()

In [21]:
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(x_test)

print("Training Accuracy :", rfc.score(X_train, y_train))
print("Validation Accuracy :", rfc.score(x_test, y_test))

Training Accuracy : 1.0
Validation Accuracy : 0.9863636363636363


In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

         afr       0.99      1.00      1.00       895
         eng       1.00      1.00      1.00       909
         nbl       0.99      0.95      0.97       879
         nso       1.00      0.99      1.00       941
         sot       1.00      1.00      1.00       918
         ssw       0.97      0.98      0.98       908
         tsn       0.99      1.00      1.00       882
         tso       1.00      1.00      1.00       857
         ven       1.00      1.00      1.00       936
         xho       0.98      0.97      0.97       922
         zul       0.93      0.96      0.95       853

    accuracy                           0.99      9900
   macro avg       0.99      0.99      0.99      9900
weighted avg       0.99      0.99      0.99      9900



In [23]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = rfc.predict(X_data2)
submission_df.to_csv('rfc.csv', index=False)

In [24]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

grid_search = GridSearchCV(estimator = rfc, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [25]:
# grid_search.fit(X_train, y_train)


In [26]:
# best_grid = grid_search.best_estimator_
#grid_accuracy = evaluate(best_grid, test_features, test_labels)

In [27]:
# best_grid

In [28]:
# rfc1 = RandomForestClassifier(max_depth=90, max_features='', min_samples_leaf=3,
#                        min_samples_split=8, n_estimators=300)

In [29]:
# rfc1.fit(X_train, y_train)

# rfc1_pred = rfc1.predict(x_test)

# print("Training Accuracy :", rfc1.score(X_train, y_train))
# print("Validation Accuracy :", rfc1.score(x_test, y_test))

In [144]:
from sklearn.naive_bayes import MultinomialNB

In [145]:
mnb = MultinomialNB(alpha = 0.017)

mnb.fit(X_train, y_train)

mnb_pred = mnb.predict(x_test)

print("Training Accuracy :", mnb.score(X_train, y_train))
print("Validation Accuracy :", mnb.score(x_test, y_test))

Training Accuracy : 1.0
Validation Accuracy : 0.9986868686868687


In [146]:
from sklearn.metrics import classification_report
print(classification_report(y_test, mnb_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       895
           1       1.00      1.00      1.00       909
           2       1.00      1.00      1.00       879
           3       1.00      1.00      1.00       941
           4       1.00      1.00      1.00       918
           5       1.00      1.00      1.00       908
           6       1.00      1.00      1.00       882
           7       1.00      1.00      1.00       857
           8       1.00      1.00      1.00       936
           9       1.00      1.00      1.00       922
          10       1.00      0.99      1.00       853

    accuracy                           1.00      9900
   macro avg       1.00      1.00      1.00      9900
weighted avg       1.00      1.00      1.00      9900



In [147]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = le.inverse_transform(mnb.predict(X_data2))
submission_df.to_csv('mnb5.csv', index=False)

In [41]:
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [63]:
xlf = xgb.XGBClassifier(num_class=7,
                                  learning_rate=0.1,
                                  num_iterations=1000,
                                  max_depth=10,
                                  feature_fraction=0.7, 
                                  scale_pos_weight=1.5,
                                  boosting='gbdt',
                                  metric='multiclass',
                                  eval_metric='mlogloss')

xlf.fit(X_train, y_train)

xlf_pred = xlf.predict(x_test)

print("Training Accuracy :", xlf.score(X_train, y_train))
print("Validation Accuracy :", xlf.score(x_test, y_test))



Parameters: { "boosting", "feature_fraction", "metric", "num_iterations", "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Training Accuracy : 0.98991341991342
Validation Accuracy : 0.9722222222222222


In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_test, xlf_pred))

              precision    recall  f1-score   support

         afr       0.99      0.99      0.99       895
         eng       0.99      1.00      1.00       909
         nbl       0.95      0.95      0.95       879
         nso       1.00      0.99      0.99       941
         sot       1.00      1.00      1.00       918
         ssw       0.96      0.96      0.96       908
         tsn       0.99      0.99      0.99       882
         tso       1.00      1.00      1.00       857
         ven       1.00      1.00      1.00       936
         xho       0.98      0.95      0.96       922
         zul       0.92      0.95      0.93       853

    accuracy                           0.98      9900
   macro avg       0.98      0.98      0.98      9900
weighted avg       0.98      0.98      0.98      9900



In [46]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = xlf.predict(X_data2)
submission_df.to_csv('xlf.csv', index=False)

In [64]:
from sklearn.ensemble import AdaBoostClassifier

In [65]:
clf = AdaBoostClassifier()

clf.fit(X_train, y_train)

clf_pred = clf.predict(x_test)

print("Training Accuracy :", clf.score(X_train, y_train))
print("Validation Accuracy :", clf.score(x_test, y_test))

Training Accuracy : 0.6542857142857142
Validation Accuracy : 0.6609090909090909


In [66]:
from sklearn.neighbors import KNeighborsClassifier

In [85]:

knn = KNeighborsClassifier(n_neighbors=59)


knn.fit(X_train, y_train)

knn_pred = clf.predict(x_test)

print("Training Accuracy :", knn.score(X_train, y_train))
print("Validation Accuracy :", knn.score(x_test, y_test))

Training Accuracy : 0.9896536796536797
Validation Accuracy : 0.9887878787878788


In [86]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = le.inverse_transform(knn.predict(X_data2))
submission_df.to_csv('knn1.csv', index=False)