In [None]:
import numpy as np                     
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
# Customise our plotting settings
sns.set_style('whitegrid')

#Libraries for data cleaning and preprocessing
#from wordcloud import WordCloud, STOPWORDS , ImageColorGenerator
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
import string
import re
import pickle
import nltk

#Libraries for data preparation and model building
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score # Classification report

In [None]:
# Load the training and test data sets. Both paths are relative, so the CSV
# files must be present in the notebook's working directory.
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set.csv')

In [None]:
df_train.head()

In [None]:
df_test.head()

## Exploratory Data Analysis

In [None]:
# Work on a copy so the frame loaded from disk (df_train) is left unmodified.
df = df_train.copy()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df['lang_id'].value_counts()

In [None]:
# Plot how many rows carry each lang_id label value.
f, ax = plt.subplots(figsize=(10, 10))
# Draw onto the axes created above by passing them explicitly.
sns.countplot(data=df, x="lang_id", ax=ax)
plt.show()

In [None]:
df['text'].tail(20)

In [None]:
string.punctuation

In [None]:
#Data preprocessing
#function that handles the removal punctuations from the tweets
def remove_punct(text):
    """
    Strip punctuation and digits from a piece of text.

    Every character listed in string.punctuation is dropped first, then every
    run of the digits 0-9 is removed. All other characters, including
    whitespace, are kept unchanged.

    text: the string to clean.
    returns: the cleaned string.
    """
    # Translation table that deletes each punctuation character.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    without_punct = text.translate(drop_punctuation)
    return re.sub('[0-9]+', '', without_punct)

In [None]:
# Replace every http/https URL in the raw text with the placeholder token 'url-web'.
# NOTE(review): remove_punct is applied afterwards and strips '-', so the
# placeholder presumably ends up as 'urlweb' in the cleaned text.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df['message_punct'] = df['text'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
# Apply remove_punct to the URL-normalised text column. The function is passed
# to apply directly; wrapping it in a lambda adds nothing.
df['message_punct'] = df['message_punct'].apply(remove_punct)
# Preview only the first rows rather than rendering the whole frame
# (the test-set cell does the same).
df.head()

In [None]:
df['message_punct'].tail(20)

## Feature Engineering

In [None]:
# Split each cleaned message into a list of tokens with NLTK's Treebank tokenizer.
# The same tokeniser instance is reused for the test data further down.
tokeniser = TreebankWordTokenizer()
df['tokens'] = df['message_punct'].apply(tokeniser.tokenize)

In [None]:
# Applying lemmatization
# NOTE(review): WordNetLemmatizer relies on the NLTK 'wordnet' corpus; no
# nltk.download call is visible in this notebook, so the corpus presumably has
# to be installed beforehand - confirm.
lemmatizer = WordNetLemmatizer()
#function that handles the process of lemmatization
def extract_lemma(words, lemmatizer):
    """
    Lemmatize every token in `words` and join the results with single spaces.

    words: iterable of token strings.
    lemmatizer: object exposing a lemmatize(word) method.
    returns: one string containing the lemmatized tokens separated by spaces.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [None]:
# Call extract_lemma on the tokens column; each row's token list becomes a
# single space-joined string of lemmas.
df['lemma'] = df['tokens'].apply(extract_lemma, args=(lemmatizer, ))

In [None]:
# Bag-of-words vectorizer over word n-grams of length 1 to 3, with the
# vocabulary capped at 27000 features.
vectorizer = CountVectorizer(max_features = 27000, analyzer = "word", ngram_range = (1,3))

In [None]:
# Learn the vocabulary from the training lemmas and encode them as a
# document-term count matrix in one step.
X_count = vectorizer.fit_transform(df['lemma'].values.astype(str))

In [None]:
X_count.shape

In [None]:
# Convert the vectorizer output to a dense NumPy array.
# NOTE(review): the dense copy has one column per vocabulary entry (up to
# 27000), which may need a lot of memory for a large training set.
X = X_count.toarray()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the lang_id labels as integers. `le` is kept because it is used later
# to turn predictions back into label values via inverse_transform.
le = LabelEncoder()
#Fit label encoder and return encoded labels
y = le.fit_transform(df['lang_id'])

In [None]:
# Label values known to the encoder; passed as target_names to the reports below.
type_labels = le.classes_

In [None]:
type_labels

### Applying the same preprocessing to the test data

In [None]:
# Work on a copy so the test frame loaded from disk (df_test) is left unmodified.
test_df = df_test.copy()

In [None]:
test_df

In [None]:
# Replace URLs in the test text with the same pattern and placeholder that were
# used for the training data.
test_df['message_punct'] = test_df['text'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
# Apply the remove_punct function to the test messages, then preview the first rows.
test_df['message_punct'] = test_df['message_punct'].apply(remove_punct)
test_df.head()

In [None]:
# Tokenize the cleaned test messages with the tokeniser instance created for the training data.
test_df['tokens'] = test_df['message_punct'].apply(tokeniser.tokenize)

In [None]:
# Apply extract_lemma to the test tokens, producing one space-joined string per row.
test_df['lemma'] = test_df['tokens'].apply(extract_lemma, args=(lemmatizer, ))

In [None]:
# Encode the test lemmas with the already-fitted vectorizer (transform only, so
# the vocabulary learned from the training data is reused).
test_count = vectorizer.transform(test_df['lemma'].values.astype(str))

In [None]:
# Convert the test count matrix to a dense array, as was done for the training features X.
# Note: x_test (lower-case, the unlabelled test set) is a different object from
# X_test, the hold-out split created by train_test_split below.
x_test = test_count.toarray()

In [None]:
x_test.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 20% of the labelled data for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

In [None]:
# One-vs-Rest strategy: one binary logistic regression is fitted per class.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated since
# scikit-learn 1.5; OneVsRestClassifier is the documented replacement and is
# also available in older releases.
from sklearn.multiclass import OneVsRestClassifier
logreg = OneVsRestClassifier(LogisticRegression())

In [None]:
# Fit the logistic regression on the training split.
logreg.fit(X_train, y_train)

In [None]:
# Predict labels for the hold-out split (X_test), for which true labels are known.
y_pred = logreg.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the logistic regression on the hold-out split.
print(classification_report(y_test, y_pred, target_names= type_labels))

# NOTE(review): this silences every warning for all code executed after this
# point; warnings already emitted by earlier cells are unaffected.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Predict labels for the unlabelled test set (x_test) with the logistic regression.
prediction = logreg.predict(x_test)

In [None]:
# Build the submission frame: the test set's 'index' column paired with the
# predicted labels, decoded from integer codes back to label values by `le`.
submission = pd.DataFrame(list(zip(test_df['index'],  le.inverse_transform(prediction))), columns = ['index', 'lang_id'])
submission.head()

In [None]:
# Save the submission as CSV; index=False keeps the DataFrame's row index out of the file.
submission.to_csv('submission_GCE1.csv', index_label = False, index = False)

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Multi-layer perceptron with three hidden layers of 30 units each.
# random_state seeds the model's random number generation so that a fresh
# Restart & Run All reproduces the same fitted model; 27 matches the seed
# already used for the train/hold-out split.
mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30), max_iter=100000, random_state=27)
mlp.fit(X_train, y_train)

In [None]:
# Predict labels for the hold-out split with the MLP.
y_mlp = mlp.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the MLP on the hold-out split.
print(classification_report(y_test, y_mlp, target_names= type_labels))

# NOTE(review): repeats the blanket warning filter that was already installed
# after the logistic-regression report.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Predict labels for the unlabelled test set (x_test) with the MLP.
predictions7 = mlp.predict(x_test)

In [None]:
# Build the MLP submission frame: the test set's 'index' column paired with the
# predicted labels decoded back to label values by `le`.
submission7 = pd.DataFrame(list(zip(test_df['index'],  le.inverse_transform(predictions7))), columns = ['index', 'lang_id'])
submission7.head()

In [None]:
# Save the MLP submission as CSV without the DataFrame's row index.
# NOTE(review): the variable is numbered 7 but the file is numbered 8 - confirm the intended name.
submission7.to_csv('submission_GCE8.csv', index_label = False, index = False)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Gaussian Naive Bayes classifier with default settings.
nb = GaussianNB()
# Fit the model 
nb.fit(X_train, y_train)

In [None]:
# Predict labels for the hold-out split with the Naive Bayes model.
y_nb = nb.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the Naive Bayes model on the hold-out split.
print(classification_report(y_test, y_nb, target_names= type_labels))

# NOTE(review): repeats the blanket warning filter that was already installed
# after the logistic-regression report.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Predict labels for the unlabelled test set with the Naive Bayes model (`nb`).
# This cell previously called mlp.predict, so the submission built from
# predictions5 was just a second copy of the MLP predictions.
predictions5 = nb.predict(x_test)

In [None]:
# Build the submission frame from predictions5: the test set's 'index' column
# paired with the predicted labels decoded back to label values by `le`.
submission5 = pd.DataFrame(list(zip(test_df['index'],  le.inverse_transform(predictions5))), columns = ['index', 'lang_id'])
submission5.head()

In [None]:
# Save the predictions5 submission as CSV without the DataFrame's row index.
# NOTE(review): the variable is numbered 5 but the file is numbered 7 - confirm the intended name.
submission5.to_csv('submission_GCE7.csv', index_label = False, index = False)