# Aprroach 1

## Import Libraries

In [1]:
# Ignore warnings
import warnings
warnings.simplefilter(action='ignore')

# Install Prerequisites
# import sys
# import nltk
# !{sys.executable} -m pip install bs4 lxml wordcloud scikit-learn scikit-plot
# nltk.download('vader_lexicon')

# Exploratory Data Analysis
import re
import ast
import time
import nltk
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
#from textblob import TextBlob
import matplotlib.pyplot as plt
#from wordcloud import WordCloud
from nltk.sentiment import SentimentIntensityAnalyzer

# Data Preprocessing
import string
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.utils import resample
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Classification Models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Performance Evaluation
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV
#from scikitplot.metrics import plot_roc, plot_confusion_matrix
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

# Display
%matplotlib inline
sns.set(font_scale=1)
sns.set_style("white")

## Import Data

In [35]:
train_data = pd.read_csv('train_set.csv')
test_data = pd.read_csv('test_set.csv')
df_train = train_data.copy() #For EDA on raw data
df_test = test_data.copy()

In [3]:
train_data.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
test_data.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


## Clean Data

In [36]:
def clean(df):
    """
    Apply data cleaning steps to raw data.
    """
    df['token'] = df['text'].apply(TweetTokenizer().tokenize) ## first we tokenize
    df['punc'] = df['token'].apply(lambda x : [i for i in x if i not in string.punctuation])## remove punctuations
    df['dig'] = df['punc'].apply(lambda x: [i for i in x if i not in list(string.digits)]) ## remove digits
    df['final'] = df['dig'].apply(lambda x: [i for i in x if len(i) > 1]) ## remove all words with only 1 character
    return df['final']
train_data['final'] = clean(train_data)
test_data['final'] = clean(test_data)

In [37]:
def get_part_of_speech(word):
    """
    Find part of speech of word if part of speech is either noun, verb, adjective etc and add it to a list.
    """
    probable_part_of_speech = wordnet.synsets(word) ## finding word that is most similar (synonyms) for semantic reasoning
    pos_counts = Counter() ## instantiating our counter class
    pos_counts["n"] = len([i for i in probable_part_of_speech if i.pos()=="n"])
    pos_counts["v"] = len([i for i in probable_part_of_speech if i.pos()=="v"])
    pos_counts["a"] = len([i for i in probable_part_of_speech if i.pos()=="a"])
    pos_counts["r"] = len([i for i in probable_part_of_speech if i.pos()=="r"])
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0] ## will extract the most likely part of speech from the list
    return most_likely_part_of_speech

normalizer = WordNetLemmatizer()

train_data['final'] = train_data['final'].apply(lambda x: [normalizer.lemmatize(token, get_part_of_speech(token)) for token in x])
test_data['final'] = test_data['final'].apply(lambda x: [normalizer.lemmatize(token, get_part_of_speech(token)) for token in x])

In [9]:
train_data.head()

Unnamed: 0,lang_id,text,token,punc,dig,final
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik...","[umgaqo-siseko, wenza, amalungiselelo, kumazik..."
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe...","[i-dha, iya, kuba, nobulumko, bokubeka, umsebe..."
2,eng,the province of kwazulu-natal department of tr...,"[the, province, of, kwazulu-natal, department,...","[the, province, of, kwazulu-natal, department,...","[the, province, of, kwazulu-natal, department,...","[the, province, of, kwazulu-natal, department,..."
3,nso,o netefatša gore o ba file dilo ka moka tše le...,"[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[o, netefatša, gore, o, ba, file, dilo, ka, mo...","[netefatša, gore, ba, file, dilo, ka, moka, tš..."
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew...","[khomishini, ya, ndinganyiso, ya, mbeu, yo, ew..."


## Split data

In [38]:
X = train_data['final']
y = train_data['lang_id']
X_test = test_data['final']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state = 42)

## Feature Engineering

In [39]:
X_train = list(X_train.apply(' '.join))
X_val = list(X_val.apply(' '.join))

vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, ngram_range = (1, 2))
vectorizer.fit(X_train)

# vect_save_path = "TfidfVectorizer.pkl"
# with open(vect_save_path,'wb') as file:
#     pickle.dump(vectorizer,file)

X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)

## Modelling

### Logistic Regression

In [12]:
modelstart = time.time()
logreg = LogisticRegression(C=1000, multi_class='ovr', solver='saga', random_state=42, max_iter=10)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)
logreg_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds"%((time.time() - modelstart)))
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).transpose()
# results.to_csv("logreg_report.csv")
results
# model_save_path = "logreg_model.pkl"
# with open(model_save_path,'wb') as file:
#     pickle.dump(logreg,file)

Accuracy 0.9945454545454545
Model Runtime: 5.91 seconds


Unnamed: 0,precision,recall,f1-score,support
afr,0.992908,0.996441,0.994671,281.0
eng,1.0,1.0,1.0,297.0
nbl,0.993769,0.975535,0.984568,327.0
nso,1.0,0.990683,0.99532,322.0
sot,0.996753,1.0,0.998374,307.0
ssw,0.993031,0.996503,0.994764,286.0
tsn,0.993289,0.996633,0.994958,297.0
tso,1.0,1.0,1.0,253.0
ven,1.0,1.0,1.0,322.0
xho,0.993651,1.0,0.996815,313.0


In [14]:
# Make prediction on test data
X = train_data['final']
y = train_data['lang_id']
X_test = test_data['final']

X = list(X.apply(' '.join))
X_test = list(X_test.apply(' '.join))

vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, ngram_range = (1, 2))
vectorizer.fit(X)

X = vectorizer.transform(X)
X_test = vectorizer.transform(X_test)


logreg = LogisticRegression(C=1000, multi_class='ovr', solver='saga', random_state=42, max_iter=10)
logreg.fit(X, y)
y_test = logreg.predict(X_test)

In [15]:
# Create Kaggle Submission File
results = pd.DataFrame({"index":test_data['index'],"lang_id": y_test})
results.to_csv("D:\Temp\initial_submission.csv", index=False)

In [None]:
#Without parameters

In [22]:
modelstart = time.time()
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)
logreg_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds"%((time.time() - modelstart)))
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).transpose()
# results.to_csv("logreg_report.csv")
results
# model_save_path = "logreg_model.pkl"
# with open(model_save_path,'wb') as file:
#     pickle.dump(logreg,file)

Accuracy 0.9951515151515151
Model Runtime: 25.31 seconds


Unnamed: 0,precision,recall,f1-score,support
afr,0.996429,0.992883,0.994652,281.0
eng,1.0,1.0,1.0,297.0
nbl,0.996904,0.984709,0.990769,327.0
nso,1.0,0.990683,0.99532,322.0
sot,1.0,1.0,1.0,307.0
ssw,0.993007,0.993007,0.993007,286.0
tsn,0.993289,0.996633,0.994958,297.0
tso,1.0,1.0,1.0,253.0
ven,1.0,1.0,1.0,322.0
xho,0.984277,1.0,0.992076,313.0


In [23]:
# Make prediction on test data
X = train_data['final']
y = train_data['lang_id']
X_test = test_data['final']

X = list(X.apply(' '.join))
X_test = list(X_test.apply(' '.join))

vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)
X_test = vectorizer.transform(X_test)


logreg = LogisticRegression()
logreg.fit(X, y)
y_test = logreg.predict(X_test)

In [None]:
# Create Kaggle Submission File
results = pd.DataFrame({"index":test_data['index'],"lang_id": y_test})
results.to_csv("D:\Temp\initial_submission2.csv", index=False)

### SVM

In [16]:
modelstart = time.time()
svc = SVC(gamma = 0.8, C = 10, random_state=42)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
svc_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds"%((time.time() - modelstart)))
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).transpose()
# results.to_csv("svc_report.csv")
results
# model_save_path = "svc_model.pkl"
# with open(model_save_path,'wb') as file:
#     pickle.dump(svc,file)

Accuracy 0.9954545454545455
Model Runtime: 320.77 seconds


Unnamed: 0,precision,recall,f1-score,support
afr,0.996429,0.992883,0.994652,281.0
eng,1.0,1.0,1.0,297.0
nbl,1.0,0.978593,0.989181,327.0
nso,1.0,0.990683,0.99532,322.0
sot,0.996753,1.0,0.998374,307.0
ssw,0.996503,0.996503,0.996503,286.0
tsn,0.993289,0.996633,0.994958,297.0
tso,1.0,1.0,1.0,253.0
ven,1.0,1.0,1.0,322.0
xho,0.990506,1.0,0.995231,313.0


In [17]:
# Make prediction on test data
X = train_data['final']
y = train_data['lang_id']
X_test = test_data['final']

X = list(X.apply(' '.join))
X_test = list(X_test.apply(' '.join))

vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, ngram_range = (1, 2))
vectorizer.fit(X)

X = vectorizer.transform(X)
X_test = vectorizer.transform(X_test)

svc = SVC(gamma = 0.8, C = 10, random_state=42)
svc.fit(X, y)
y_test = svc.predict(X_test)

In [24]:
# Create Kaggle Submission File
results = pd.DataFrame({"index":test_data['index'],"lang_id": y_test})
results.to_csv("D:\Temp\initial_submission4.csv", index=False)

In [None]:
# Without parameters

In [None]:
modelstart = time.time()
svc = SVC(random_state=42)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_val)
svc_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds"%((time.time() - modelstart)))
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).transpose()
# results.to_csv("svc_report.csv")
results
# model_save_path = "svc_model.pkl"
# with open(model_save_path,'wb') as file:
#     pickle.dump(svc,file)

In [26]:
# Make prediction on test data
X = train_data['final']
y = train_data['lang_id']
X_test = test_data['final']

X = list(X.apply(' '.join))
X_test = list(X_test.apply(' '.join))

vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, ngram_range = (1, 2))
vectorizer.fit(X)

X = vectorizer.transform(X)
X_test = vectorizer.transform(X_test)

svc = SVC()
svc.fit(X, y)
y_test = svc.predict(X_test)

In [27]:
# Create Kaggle Submission File
results = pd.DataFrame({"index":test_data['index'],"lang_id": y_test})
results.to_csv("D:\Temp\initial_submission5.csv", index=False)

## Linear SVC

In [19]:
modelstart = time.time() 
linsvc = LinearSVC()
linsvc.fit(X_train, y_train)
y_pred = linsvc.predict(X_val)
linsvc_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds"%((time.time() - modelstart)))
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).transpose()
# results.to_csv("linsvc_report.csv")
results
# model_save_path = "linsvc_model.pkl"
# with open(model_save_path,'wb') as file:
#     pickle.dump(linsvc,file)

Accuracy 0.9960606060606061
Model Runtime: 1.83 seconds


Unnamed: 0,precision,recall,f1-score,support
afr,0.996454,1.0,0.998224,281.0
eng,1.0,1.0,1.0,297.0
nbl,1.0,0.978593,0.989181,327.0
nso,1.0,0.993789,0.996885,322.0
sot,0.996753,1.0,0.998374,307.0
ssw,0.996503,0.996503,0.996503,286.0
tsn,0.996633,0.996633,0.996633,297.0
tso,1.0,1.0,1.0,253.0
ven,1.0,1.0,1.0,322.0
xho,0.990506,1.0,0.995231,313.0


In [None]:
# Make prediction on test data
X = train_data['final']
y = train_data['lang_id']
X_test = test_data['final']

X = list(X.apply(' '.join))
X_test = list(X_test.apply(' '.join))

vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, ngram_range = (1, 2))
vectorizer.fit(X)

X = vectorizer.transform(X)
X_test = vectorizer.transform(X_test)

linsvc = LinearSVC()
linsvc.fit(X, y)
y_test = linsvc.predict(X_test)

In [21]:
# Create Kaggle Submission File
results = pd.DataFrame({"index":test_data['index'],"lang_id": y_test})
results.to_csv("D:\Temp\initial_submission3.csv", index=False)

## Multinomial Naive Bayes

In [40]:
modelstart= time.time()
multinb = MultinomialNB()
multinb.fit(X_train, y_train)
y_pred = multinb.predict(X_val)
multinb_f1 = round(f1_score(y_val, y_pred, average='weighted'),2)
print('Accuracy %s' % accuracy_score(y_pred, y_val))
print("Model Runtime: %0.2f seconds"%((time.time() - modelstart)))
report = classification_report(y_val, y_pred, output_dict=True)
results = pd.DataFrame(report).transpose()
# results.to_csv("multinb_report.csv")
results
# model_save_path = "multinb_model.pkl"
# with open(model_save_path,'wb') as file:
#     pickle.dump(multinb,file)

Accuracy 0.9978787878787879
Model Runtime: 0.14 seconds


Unnamed: 0,precision,recall,f1-score,support
afr,0.996454,1.0,0.998224,281.0
eng,1.0,1.0,1.0,297.0
nbl,0.996914,0.987768,0.99232,327.0
nso,1.0,0.996894,0.998445,322.0
sot,1.0,1.0,1.0,307.0
ssw,1.0,1.0,1.0,286.0
tsn,0.996633,0.996633,0.996633,297.0
tso,1.0,1.0,1.0,253.0
ven,1.0,1.0,1.0,322.0
xho,1.0,1.0,1.0,313.0


In [41]:
# Make prediction on test data
X = train_data['final']
y = train_data['lang_id']
X_test = test_data['final']

X = list(X.apply(' '.join))
X_test = list(X_test.apply(' '.join))

vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, ngram_range = (1, 2))
vectorizer.fit(X)

X = vectorizer.transform(X)
X_test = vectorizer.transform(X_test)

multinb = MultinomialNB()
multinb.fit(X, y)
y_test = multinb.predict(X_test)

In [42]:
# Create Kaggle Submission File
results = pd.DataFrame({"index":test_data['index'],"lang_id": y_test})
results.to_csv("D:\Temp\initial_submission6.csv", index=False)

# Approach 2

In [81]:
df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_set.csv')

In [82]:
# Preparing
X = df['text'].astype(str)
y = df['lang_id']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05,random_state=1)

In [84]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [85]:
# Different Models
LogReg = LogisticRegression()
LinSVC = LinearSVC()
NB = MultinomialNB()
onevrest = OneVsRestClassifier(LinearSVC(),n_jobs=4)
vectorizer = TfidfVectorizer(
                             min_df=2, 
                             max_df=0.9,strip_accents='unicode',
                             analyzer='word',
                             ngram_range=(1, 2))

In [86]:
# Vectorize
tfidf = vectorizer.fit(X_train)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

In [87]:
# Fit Models
linsvcmodel = LinSVC.fit(X_train,y_train) #fits this pipeline using the training data
naivebayesmodel = NB.fit(X_train,y_train)
logisticregression = LogReg.fit(X_train,y_train)


In [88]:
# Choose Model
text_clf = linsvcmodel

In [89]:
# Predict
predictions = text_clf.predict(X_test)

In [90]:
# Metrics
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
from sklearn import metrics
print(classification_report(y_test,predictions))
print(f"Accuracy score : {accuracy_score(y_test,predictions)}")
print(f"f1 score : {f1_score(y_test,predictions,average='weighted')}")

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       155
         eng       1.00      1.00      1.00       139
         nbl       1.00      0.99      1.00       142
         nso       0.99      1.00      1.00       160
         sot       1.00      0.99      1.00       148
         ssw       1.00      1.00      1.00       168
         tsn       1.00      1.00      1.00       140
         tso       1.00      1.00      1.00       146
         ven       1.00      1.00      1.00       147
         xho       1.00      1.00      1.00       156
         zul       0.99      1.00      1.00       149

    accuracy                           1.00      1650
   macro avg       1.00      1.00      1.00      1650
weighted avg       1.00      1.00      1.00      1650

Accuracy score : 0.9987878787878788
f1 score : 0.9987877382869665


In [91]:
kaggle_predictions = text_clf.predict(vectorizer.transform(test_df["text"].astype(str)))

kaggle_df = pd.DataFrame(
    {'index': test_df['index'],
     'lang_id': kaggle_predictions
    })

kaggle_df.to_csv("D:\Temp\hack_submission2.csv",index=False)

In [92]:
# Choose Model
text_clf = naivebayesmodel

In [93]:
# Predict
predictions = text_clf.predict(X_test)

In [94]:
# Metrics
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
from sklearn import metrics
print(classification_report(y_test,predictions))
print(f"Accuracy score : {accuracy_score(y_test,predictions)}")
print(f"f1 score : {f1_score(y_test,predictions,average='weighted')}")

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       155
         eng       1.00      1.00      1.00       139
         nbl       1.00      0.99      0.99       142
         nso       1.00      1.00      1.00       160
         sot       1.00      1.00      1.00       148
         ssw       1.00      1.00      1.00       168
         tsn       1.00      1.00      1.00       140
         tso       1.00      1.00      1.00       146
         ven       1.00      1.00      1.00       147
         xho       1.00      1.00      1.00       156
         zul       0.99      1.00      0.99       149

    accuracy                           1.00      1650
   macro avg       1.00      1.00      1.00      1650
weighted avg       1.00      1.00      1.00      1650

Accuracy score : 0.9987878787878788
f1 score : 0.9987876208897485


In [95]:
kaggle_predictions = text_clf.predict(vectorizer.transform(test_df["text"].astype(str)))

kaggle_df = pd.DataFrame(
    {'index': test_df['index'],
     'lang_id': kaggle_predictions
    })

kaggle_df.to_csv("D:\Temp\hack_submission3.csv",index=False)

In [96]:
# Choose Model
text_clf = logisticregression

In [97]:
# Predict
predictions = text_clf.predict(X_test)

In [98]:
# Metrics
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
from sklearn import metrics
print(classification_report(y_test,predictions))
print(f"Accuracy score : {accuracy_score(y_test,predictions)}")
print(f"f1 score : {f1_score(y_test,predictions,average='weighted')}")

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       155
         eng       1.00      1.00      1.00       139
         nbl       0.99      0.99      0.99       142
         nso       1.00      1.00      1.00       160
         sot       1.00      0.99      1.00       148
         ssw       1.00      1.00      1.00       168
         tsn       1.00      0.99      1.00       140
         tso       1.00      1.00      1.00       146
         ven       1.00      1.00      1.00       147
         xho       0.99      1.00      0.99       156
         zul       0.99      0.99      0.99       149

    accuracy                           1.00      1650
   macro avg       1.00      1.00      1.00      1650
weighted avg       1.00      1.00      1.00      1650

Accuracy score : 0.996969696969697
f1 score : 0.9969715727573613


In [67]:
kaggle_predictions = text_clf.predict(vectorizer.transform(test_df["text"].astype(str)))

In [68]:
kaggle_df = pd.DataFrame(
    {'index': test_df['index'],
     'lang_id': kaggle_predictions
    })

In [70]:
kaggle_df.shape

(5682, 2)

In [72]:
kaggle_df.to_csv("D:\Temp\hack_submission.csv",index=False)