In [1]:
from glob import glob

import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


In [7]:
random_state = 42
data_percentage = 0.01 # how much of the data we will use

# clf = MultinomialNB(alpha=0.1)
# clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)
# clf = SVC(random_state=random_state)
clf = VotingClassifier(estimators=[
        ('MNB', MultinomialNB(alpha=0.1)), ('RF', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)), ('SVM', SVC(random_state=random_state))], voting='hard')

In [3]:
def get_SMADC_folder_data():
    """Returns a dataframe with Text and Region columns. Requires tree like this data/SMADC/*.txt"""
    files = glob("../data/SMADC/*.txt")
    dataframes = []

    for file in files:
        region = file[-7:-4]
        temp_df = pd.read_csv(file, encoding="utf8", delimiter="\n", names=["Text"])
        temp_df["Region"] = region
        dataframes.append(temp_df)
        
    return pd.concat(dataframes)
df = get_SMADC_folder_data()
df = df.sample(frac=data_percentage)

In [4]:
%%time
X_train, X_test, y_train, y_test = train_test_split(df["Text"], df["Region"], random_state=random_state)
count_vectorizer = CountVectorizer()
X_vectorized = count_vectorizer.fit_transform(X_train)
# standard_scaler = StandardScaler(with_mean=False).fit(X_vectorized)

Wall time: 193 ms


In [8]:
%%time
model = make_pipeline(count_vectorizer, clf)
model.fit(X_train, y_train)
model.score(X_test, y_test)

Wall time: 22.4 s


0.6740488358886996

In [9]:
precision, recall, fscore, support = precision_recall_fscore_support(model.predict(X_test), y_test)
for i, region in enumerate(df["Region"].unique()):
    print(f"{region}\tPrecision: {round(precision[i], 4)}, Recall: {round(recall[i], 4)}, FScore: {round(fscore[i], 4)}, Support: {support[i]}")

GLF	Precision: 0.9624, Recall: 0.6474, FScore: 0.7741, Support: 2334
EGY	Precision: 0.4543, Recall: 0.701, FScore: 0.5514, Support: 291
IRQ	Precision: 0.3842, Recall: 0.7816, FScore: 0.5152, Support: 174
NOR	Precision: 0.3573, Recall: 0.7373, FScore: 0.4813, Support: 236
LEV	Precision: 0.5272, Recall: 0.7166, FScore: 0.6075, Support: 487


In [10]:
joblib.dump(model, f"{str(model[-1])}.model")

FileNotFoundError: [Errno 2] No such file or directory: "VotingClassifier(estimators=[('MNB', MultinomialNB(alpha=0.1)),\n                             ('RF',\n                              RandomForestClassifier(n_jobs=-1,\n                                                     random_state=42)),\n                             ('SVM', SVC(random_state=42))]).model"

In [11]:
str(model[-1])

"VotingClassifier(estimators=[('MNB', MultinomialNB(alpha=0.1)),\n                             ('RF',\n                              RandomForestClassifier(n_jobs=-1,\n                                                     random_state=42)),\n                             ('SVM', SVC(random_state=42))])"