# Text classifier

In [1]:
import pickle

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [60]:
# Single seed shared by the stochastic steps of this notebook (sampling, model fitting).
random_state = 40

# Also seed NumPy's legacy global RNG: pandas `.sample()` and scikit-learn
# estimators fall back to it whenever no explicit `random_state` is passed,
# so without this a fresh "Restart & Run All" would not reproduce the results.
np.random.seed(random_state)

### Data

In [3]:
# Load the Wikipedia page dataset; the cells below rely on its
# `category`, `text` and `label` columns.
df = pd.read_csv("./wikipedia_data.csv")

In [4]:
# Peek at the first rows to sanity-check the load.
df.head()

Unnamed: 0,category,page,text,label
0,Finance,Portal:Banks,"The human back, also called the dorsum, is the...",3
1,Finance,Finance,Finance is the study of money and assets. It i...,3
2,Finance,Approved Publication Arrangement,With MiFID II directive being in force in Janu...,3
3,Finance,Asset,"In financial accounting, an asset is any resou...",3
4,Finance,Austerity,Austerity is a set of political-economic polic...,3


In [61]:
def get_class_map(df):
    """Build a ``{label: category}`` lookup, ordered by ascending label.

    For each distinct category the label is taken from the first row in
    which that category appears.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``category`` column and a ``label`` column.

    Returns
    -------
    dict
        Maps each label (as a plain Python scalar) to its category name,
        with keys in ascending order. Empty when ``df`` has no rows.

    Raises
    ------
    KeyError
        If ``df`` lacks a ``category`` or ``label`` column.
    """
    # One pass over the frame instead of one boolean filter per category.
    # Rows with a missing category are skipped: they cannot be matched by
    # equality, so looking up "their" first row would raise IndexError.
    first_rows = (
        df.dropna(subset=['category'])
          .drop_duplicates(subset='category', keep='first')
    )

    # `.tolist()` yields native Python scalars, so the keys print and
    # serialise cleanly while still matching NumPy integer lookups.
    # If two categories share a label, the later category wins.
    class_map = dict(zip(first_rows['label'].tolist(),
                         first_rows['category'].tolist()))
    return dict(sorted(class_map.items()))

# Label -> category lookup, used later to turn a predicted label back into a readable name.
class_map = get_class_map(df)
class_map

{0: 'Business Administration',
 1: 'Life Science',
 2: 'Technology',
 3: 'Finance'}

In [62]:
# Rows per category: the smallest count caps the per-category sample size used for the split below.
df['category'].value_counts()

Life Science               535
Technology                 462
Finance                    175
Business Administration    141
Name: category, dtype: int64

### Get stratified, randomly split train/test datasets (equal number of samples per category)

In [63]:
# Balance the classes by down-sampling every category to the size of the
# smallest one, then split each category 90/10 into train/test.
cats = df['category'].unique().tolist()
min_cat_samples = df['category'].value_counts().min()
num_train_samples = int(np.floor(min_cat_samples*0.9))
num_test_samples = min_cat_samples - num_train_samples

print(f"Each category will have train size {num_train_samples} and test size {num_test_samples}")

# Collect the per-category pieces and concatenate once at the end, rather
# than re-copying a growing DataFrame on every iteration.
train_parts = []
test_parts = []

for cat in cats:
    df_cat = df[df['category']==cat]
    # Seed BOTH draws: with an unseeded train sample the split (and every
    # downstream score) would change on each run.
    train_cat = df_cat.sample(num_train_samples,random_state=random_state)
    # Test rows are drawn only from rows not already used for training.
    test_cat = df_cat[~df_cat.index.isin(train_cat.index)].sample(num_test_samples,random_state=random_state)

    train_parts.append(train_cat)
    test_parts.append(test_cat)

df_train = pd.concat(train_parts)
df_test = pd.concat(test_parts)

# Guard against leakage: no row may appear in both splits.
assert df_train.index.intersection(df_test.index).empty, "train/test splits overlap"

print(f"{df_train.shape=}")
print(f"{df_test.shape=}")

Each category will have train size 126 and test size 15
df_train.shape=(504, 4)
df_test.shape=(60, 4)


In [64]:
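# Alternative: a proportional stratified split via scikit-learn (keeps the original class imbalance).
# Note: `stratify` expects the label values themselves, not a column name.
# df_train,df_test = train_test_split(df,test_size=0.1,stratify=df['label'],random_state=42)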

In [65]:
# Persist both splits next to the notebook; the DataFrame index is not written out.
df_train.to_csv(path_or_buf="./wikipedia_train_data.csv", index=False)
df_test.to_csv(path_or_buf="./wikipedia_test_data.csv", index=False)

In [75]:
# Pull the article text (features) and labels (targets) out of each split as plain lists.
X_train, y_train = df_train['text'].tolist(), df_train['label'].tolist()
X_test, y_test = df_test['text'].tolist(), df_test['label'].tolist()

# Sanity check: features and targets must have matching lengths within each split.
print(f"{len(X_train)=}")
print(f"{len(y_train)=}")
print(f"{len(X_test)=}")
print(f"{len(y_test)=}")

len(X_train)=504
len(y_train)=504
len(X_test)=60
len(y_test)=60


### Classifier

In [73]:
# Baseline: TF-IDF (uni- and bi-grams, English stop words removed) feeding a
# hard-voting ensemble (VotingClassifier's default voting mode).
# MultinomialNB is used for the naive Bayes member because TfidfVectorizer
# emits a sparse matrix, which GaussianNB refuses to fit on.
# The random forest is seeded so that repeated runs give the same model.
# NOTE: the next cell rebinds `model`, so this baseline is only kept for comparison.
model = Pipeline([('tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,2),lowercase=True)),
                  ('voting_ensemble',VotingClassifier(estimators=[('rf',RandomForestClassifier(random_state=random_state)),
                                                                  ('lr',LogisticRegression()),
                                                                  ('nb',MultinomialNB())]))
                 ])

In [89]:
# Final model: TF-IDF (uni- and bi-grams, English stop words removed) feeding a
# soft-voting ensemble, i.e. the members' predicted class probabilities are
# averaged. SVC needs `probability=True` to expose those probabilities.
# Every stochastic member is seeded so that the fitted model and its test
# score are reproducible; estimator/step names are unchanged.
model = Pipeline([('tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,2),lowercase=True)),
                  ('clf', VotingClassifier(estimators=[("pip1", GradientBoostingClassifier(n_estimators=100, random_state=7)),
                                                       ("pip2", SVC(probability=True, random_state=random_state)),
                                                       ("pip3", RandomForestClassifier(random_state=random_state))],
                                           voting="soft"))
                 ])

In [90]:
# Fit the whole pipeline (vectorizer + ensemble) on the training split only.
model.fit(X_train,y_train)

In [43]:
# Optional: fit a copy of the pipeline on ALL rows (train + test).
# This is done on a clone under a separate name so that `model` stays
# trained on the training split only; refitting `model` here would leak the
# test rows into it and make the `model.score(X_test, y_test)` cell below
# meaningless when the notebook is run top to bottom.
# Pickle `model_full` instead of `model` if the shipped artifact should use all data.
model_full = clone(model).fit(df['text'].tolist(),df['label'].tolist())
model_full

In [91]:
# Mean accuracy on the held-out test split.
model.score(X_test,y_test)

0.9333333333333333

In [92]:
# Smoke test: classify one hand-written sentence and translate the
# predicted label back into its category name.
sample_text = "when protein expression rises beyond these levels we typically see an increased regulation in."
predicted_label = model.predict([sample_text])[0]
class_map[predicted_label]

'Life Science'

In [93]:
# Serialise the fitted pipeline (vectorizer + ensemble) to disk.
# Pickle files can execute code when loaded, so only unpickle this from a trusted location.
with open("./text_classifier.pkl", mode="wb") as file:
    pickle.dump(model, file)