## EXPLORE Data Science Academy Classification Hackathon
### Overview
South Africa is a multicultural society that is characterised by its rich linguistic diversity. Language is an indispensable tool that can be used to deepen democracy and also contribute to the social, cultural, intellectual, economic and political life of the South African society.

The country is multilingual with 11 official languages, each of which is guaranteed equal status. Most South Africans are multilingual and able to speak at least two or more of the official languages.
— South African Government

This has prompted the need for a model that can classify the different languages based on the text of tweets.

In [None]:
# Packages for data analysis
import numpy as np                     
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
# Customise our plotting settings
sns.set_style('whitegrid')

#Libraries for data cleaning and preprocessing
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
import string
import re
import nltk

#Libraries for data preparation and model building
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Libraries for test of model performance
from sklearn import metrics

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

In [None]:
# Load the training set, the test set and the sample submission.
# The paths are relative, so the CSV files must be in the working directory.
df_train = pd.read_csv('train_set.csv')
df_test = pd.read_csv('test_set.csv')
df_sample_submission = pd.read_csv('sample_submission.csv')

In [None]:
# Preview the first rows of the training data
df_train.head()

In [None]:
# Inspect the test rows at positions 100 to 199
df_test.iloc[100:200]

In [None]:
# Display the sample submission frame
df_sample_submission

## Exploratory Data Analysis

In [None]:
# Work on a copy so the raw training frame (df_train) stays unmodified
df = df_train.copy()

In [None]:
# Summarise the columns: data types and non-null counts
df.info()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Count the samples per language label to check for class imbalance
df['lang_id'].value_counts()

The counts in the cell above show that the class labels are balanced.

In [None]:
# Bar chart of the number of samples per language label
f, ax = plt.subplots(figsize=(10, 10))
# draw onto the axes created above instead of relying on the current axes
ax = sns.countplot(x="lang_id", data=df, ax=ax)
plt.show()

In [None]:
#creating a function that cleans the data
def clean_text(text):
    """
    Normalise a raw text sample.

    Anything that looks like an HTML tag (``<...>`` within a single line)
    and every newline is replaced by a space, the text is lower-cased, and
    runs of whitespace are collapsed to single spaces with leading and
    trailing whitespace dropped. Digits and punctuation are left untouched.

    Input:
    text: original text
          datatype: string

    Output:
    the normalised text
          datatype: string
    """
    # tags are handled first; '.' does not match a newline, so a tag that is
    # split across lines is not removed
    without_tags = re.sub('<.*?>', ' ', text)
    lowered = re.sub("\n", " ", without_tags).lower()
    # split() without arguments discards every whitespace run
    return ' '.join(lowered.split())

In [None]:
# Clean every training text with clean_text (tag removal, lower-casing,
# whitespace normalisation)
df['text'] = df['text'].apply(clean_text)

In [None]:
# Replace the literal substring '.txt' with ' text file'.
# regex=False is needed: on pandas < 2.0 the pattern is treated as a regular
# expression by default, where '.' matches any character, so a string such as
# 'atxt' would be rewritten as well.
df["text"] = df["text"].str.replace(".txt", " text file", regex=False)

In [None]:
# Show the punctuation characters that remove_punct (defined below) strips
string.punctuation

In [None]:
# Further Data preprocessing
#function that handles the removal punctuations from the dataset
# translation table that deletes every character in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """
    Strip punctuation and digits from a text.

    Every character listed in string.punctuation is deleted (nothing is
    inserted in its place, so "don't" becomes "dont"), and afterwards each
    run of the digits 0-9 is deleted as well.

    Input:
    text: text to strip
          datatype: string

    Output:
    the text without punctuation and digits
          datatype: string
    """
    without_punct = text.translate(_PUNCTUATION_TABLE)
    return re.sub('[0-9]+', '', without_punct)

In [None]:
# Strip punctuation and digits from the training text column, then show the frame
df['text'] = df['text'].apply(remove_punct)
df

### Applying the same preprocessing to the test data

In [None]:
# Work on a copy so the raw test frame (df_test) stays unmodified
test_df = df_test.copy()

In [None]:
# Display the test frame before cleaning
test_df

In [None]:
# Clean every test text with the same clean_text function used on the training data
test_df['text'] = test_df['text'].apply(clean_text)

In [None]:
# Replace the literal substring '.txt' with ' text file'.
# regex=False is needed: on pandas < 2.0 the pattern is treated as a regular
# expression by default, where '.' matches any character.
test_df["text"] = test_df["text"].str.replace(".txt", " text file", regex=False)
# Strip punctuation and digits as was done for the training text; without
# this step the model is trained and applied on differently prepared text.
test_df['text'] = test_df['text'].apply(remove_punct)

In [None]:
# The cleaned text column is the only model input
X = df['text']

In [None]:
# Encode the language labels as integer class codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit on the training labels and return their codes; `le` is kept so the
# predictions can be mapped back to label strings with inverse_transform
y = le.fit_transform(df['lang_id'])

In [None]:
# Keep the fitted encoder's class labels (an array, ordered by class code)
type_labels = le.classes_

In [None]:
# Display the class labels
type_labels

In [None]:
# Hold out 25% of the labelled data for validation; random_state fixes the
# shuffle so the split is reproducible. X_test / y_test here are this
# validation split, not the competition test set (test_df).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

In [None]:
# Candidate models to benchmark.
# Note: some classifiers were left out of this list because they run for
# a very long time.
classifiers = [
    LogisticRegression(
        C=1e5, max_iter=4000, multi_class='ovr', n_jobs=1, random_state=42,
    ),
    KNeighborsClassifier(n_neighbors=5),
    MultinomialNB(),
    ComplementNB(),
    # tol=None disables the tolerance-based early stop, so SGD always runs
    # the full max_iter=5 epochs
    SGDClassifier(
        alpha=1e-3, loss='hinge', max_iter=5, penalty='l2',
        random_state=42, tol=None,
    ),
]

In [None]:
# Benchmark helper: fit every candidate classifier and tabulate its scores
def classifier_models(classifiers, X_train, y_train, X_test, y_test):
    """
    Fit each classifier behind a TF-IDF vectoriser and score it on the
    validation split.

    Every estimator in ``classifiers`` gets its own pipeline (unigram
    TF-IDF with max_df=0.9 followed by the estimator). The pipeline is
    fitted on the training split and used to predict the validation split;
    the wall-clock time of fit plus predict is recorded with the scores.

    Input:
    classifiers: iterable of estimators to compare
    X_train, y_train: training texts and labels
    X_test, y_test: validation texts and labels

    Output:
    DataFrame indexed by classifier class name with the columns
    'F1-Macro', 'F1-Accuracy' (micro-averaged F1), 'F1-Weighted' and
    'Execution Time' (seconds). Classifiers of the same class share one
    key, so only the last of them is kept.
    """
    # column name -> averaging mode passed to f1_score
    f1_columns = (('F1-Macro', 'macro'),
                  ('F1-Accuracy', 'micro'),
                  ('F1-Weighted', 'weighted'))
    results = {}

    for estimator in classifiers:
        vectoriser = TfidfVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 1))
        text_pipeline = Pipeline([('tfidf', vectoriser), ('clf', estimator)])

        # time training and prediction together
        started = time.time()
        text_pipeline.fit(X_train, y_train)
        predicted = text_pipeline.predict(X_test)
        elapsed = time.time() - started

        row = {
            column: metrics.f1_score(y_test, predicted, average=averaging)
            for column, averaging in f1_columns
        }
        row['Execution Time'] = elapsed
        results[estimator.__class__.__name__] = row

    return pd.DataFrame.from_dict(results, orient='index')

In [None]:
# Benchmark all candidate classifiers on the validation split and rank
# them by macro-averaged F1, best first
classifiers_df = classifier_models(classifiers, X_train, y_train, X_test, y_test)
Order_of_performance = classifiers_df.sort_values('F1-Macro', ascending=False)
Order_of_performance 

In [None]:
# Tune MultinomialNB's alpha on unigram + bigram TF-IDF features
param_grid = {'alpha': [0.1, 1, 5, 10, 100]}  # candidate alpha values

bigram_tfidf = TfidfVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))
alpha_search = GridSearchCV(MultinomialNB(),
                            param_grid=param_grid,
                            cv=5,
                            n_jobs=-1,
                            scoring='f1_weighted')

# NOTE(review): the vectoriser sits outside the grid search, so it is fitted
# on all of X_train before the 5 CV folds are cut; consider searching over
# the whole pipeline instead if unbiased CV scores matter.
modified_mnb = Pipeline([('tfidf', bigram_tfidf), ('mnb', alpha_search)])

# fit the vectoriser and run the grid search on the training split
modified_mnb.fit(X_train, y_train)

# predict the held-out validation split
y_mnb = modified_mnb.predict(X_test)

print(classification_report(y_test, y_mnb, target_names=type_labels))

In [None]:
# Predict the encoded language label for every text in the test dataset
prediction3 = modified_mnb.predict(test_df['text'])

In [None]:
# Build the submission frame: the test 'index' column plus the predicted
# label, decoded from class codes back to the original label strings
submission_df5 = test_df[['index']].copy()
submission_df5['lang_id'] = le.inverse_transform(prediction3)
# write without the DataFrame's own row index
submission_df5.to_csv('submission_modified_mnb2.csv', index=False)