In [1]:
from glob import glob

import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


In [8]:
# Run-wide settings.
random_state = 42
data_percentage = 0.5  # fraction of the loaded rows that is kept for this run

# Alternative single-model setups. To train one of them instead of the
# ensemble, uncomment its pair of lines and comment out the ensemble below.
#
# clf = MultinomialNB(alpha=0.1)
# name = "MultiNaiveBayes"
#
# clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)
# name = "RandomForest"
#
# clf = SVC(random_state=random_state)
# name = "SVM"

# Hard-voting ensemble over the three classifiers listed above.
clf = VotingClassifier(
    estimators=[
        ('MNB', MultinomialNB(alpha=0.1)),
        ('RF', RandomForestClassifier(n_estimators=100, n_jobs=3, random_state=random_state)),
        ('SVM', SVC(random_state=random_state)),
    ],
    voting='hard',
)
name = "VotingClassifier"  # file stem used when the fitted model is saved

In [9]:
def get_SMADC_folder_data(data_dir="../data/SMADC"):
    """Load every ``*.txt`` file in ``data_dir`` into one dataframe.

    Each non-blank line of a file becomes one row. The text is taken
    literally: no CSV quoting rules are applied and strings such as ``NA``
    are not converted to missing values.

    Parameters
    ----------
    data_dir : str, optional
        Folder containing the corpus files. Defaults to ``../data/SMADC``,
        i.e. the tree ``data/SMADC/*.txt`` relative to the parent folder.

    Returns
    -------
    pandas.DataFrame
        Columns ``Text`` (one line of a file) and ``Region`` (the three
        characters that precede the ``.txt`` extension of the file name).
        Files are concatenated in sorted path order; the index restarts at 0
        for every file.

    Raises
    ------
    ValueError
        If no ``*.txt`` file is found in ``data_dir``.
    """
    # Sort because glob() order depends on the filesystem; a stable row order
    # is needed for any seeded sampling/splitting done afterwards.
    paths = sorted(glob(f"{data_dir}/*.txt"))
    if not paths:
        raise ValueError(f"No .txt files found in {data_dir!r}")

    dataframes = []
    for path in paths:
        # NOTE(review): assumes file names end with a 3-character region code
        # right before ".txt" -- confirm against the dataset's naming scheme.
        region = path[-7:-4]

        # Read the lines directly instead of pd.read_csv(delimiter="\n"):
        # newer pandas rejects "\n" as a delimiter, and CSV parsing would
        # reinterpret quotes and NA-like strings inside the text.
        with open(path, encoding="utf8") as handle:
            texts = [line.rstrip("\n") for line in handle]
        texts = [text for text in texts if text.strip()]  # drop blank lines

        dataframes.append(
            pd.DataFrame({"Text": texts, "Region": [region] * len(texts)})
        )

    return pd.concat(dataframes)
# Load the full corpus, then keep a random `data_percentage` fraction of it.
df = get_SMADC_folder_data()
# Seed the subsampling so every run works on the same rows; without it the
# reported scores change from run to run even though the split and the
# classifiers are seeded.
df = df.sample(frac=data_percentage, random_state=random_state)

In [10]:
%%time
# Hold out a test set (default split proportions), seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],
    df["Region"],
    random_state=random_state,
)

# Learn the bag-of-words vocabulary from the training texts only and keep the
# resulting document-term matrix.
count_vectorizer = CountVectorizer()
X_vectorized = count_vectorizer.fit_transform(X_train)

# Optional scaling step, currently disabled:
# standard_scaler = StandardScaler(with_mean=False).fit(X_vectorized)

Wall time: 7.47 s


In [None]:
%%time
# Chain vectorizer and classifier so raw text can be passed straight to
# predict(); fit() returns the pipeline itself, so it can be assigned directly.
model = make_pipeline(count_vectorizer, clf).fit(X_train, y_train)

# Score on the held-out split (shown as the cell output).
model.score(X_test, y_test)

In [None]:
# Per-region precision / recall / F-score on the held-out split.
#
# Two things matter here:
#   * precision_recall_fscore_support expects (y_true, y_pred) in that order;
#     swapping them exchanges precision with recall and makes `support` count
#     predictions instead of true samples.
#   * the returned arrays follow the order of `labels`, so the same explicit
#     list is used for both the computation and the printed row names.
region_labels = sorted(df["Region"].unique())
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test,
    model.predict(X_test),
    labels=region_labels,
)
for i, region in enumerate(region_labels):
    print(f"{region}\tPrecision: {round(precision[i], 4)}, Recall: {round(recall[i], 4)}, FScore: {round(fscore[i], 4)}, Support: {support[i]}")

In [None]:
# Persist the fitted pipeline (vectorizer + classifier) as "<name>.model" in
# the current working directory.
joblib.dump(value=model, filename=f"{name}.model")