In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [2]:
# Load the training set from train.csv; the relative path is resolved against
# the current working directory of the kernel.
df = pd.read_csv("train.csv")

In [3]:
# Replace every missing value with an empty string, then discard the
# id, title and author columns.
df = df.fillna('').drop(columns=['id', 'title', 'author'])

In [4]:
# One shared Porter stemmer instance; stemming() calls port_stem.stem() on
# each token it keeps.
port_stem = PorterStemmer()

In [5]:
# Build the stop-word lookup once. The original evaluated
# stopwords.words('english') inside the comprehension filter, i.e. once per
# token, and tested membership against the returned list (a linear scan).
# A frozenset gives constant-time membership and is built a single time.
_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. every character that is not an ASCII letter becomes a space;
      2. the text is lower-cased and split on whitespace;
      3. English stop words are dropped;
      4. each remaining token is reduced with the shared Porter stemmer;
      5. the stems are re-joined with single spaces.

    Args:
        content: The text to clean. Must be a ``str`` (``re.sub`` is applied
            to it directly).

    Returns:
        The cleaned text as a ``str``; an empty string when no token survives
        the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in _STOP_WORDS
    )

In [6]:
# Run every row of the text column through stemming() and store the cleaned
# result back in the same column.
stemmed_text = df['text'].apply(stemming)
df['text'] = stemmed_text

In [7]:
# Features are the cleaned text column; the target is the label column.
x, y = df['text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)


In [8]:
# TF-IDF vectorizer with default settings.
vect = TfidfVectorizer()
# Fit on the training text only, and transform it in the same step.
x_train_vect = vect.fit_transform(x_train)
# The test text is only transformed with the already-fitted vectorizer, so
# nothing is learned from the held-out split.
x_test_vect = vect.transform(x_test)

In [9]:
# Baseline classifier: a single decision tree on the TF-IDF features.
# random_state is pinned (same seed as the train/test split) because tree
# construction is randomised; without it the score, and the model selected
# for export further down, can change from one run to the next.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(x_train_vect, y_train)

# score() on a classifier reports mean accuracy on the given data.
dt_score = dt_model.score(x_test_vect, y_test)
print(dt_score)

0.8853365384615385


In [10]:
# Second candidate: a random forest on the same TF-IDF features.
# random_state is pinned (same seed as the train/test split) because both the
# bootstrap sampling and the per-split feature sampling are randomised;
# without it the score, and the model selected for export further down, can
# change from one run to the next.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train_vect, y_train)

# score() on a classifier reports mean accuracy on the given data.
rf_score = rf_model.score(x_test_vect, y_test)
print(rf_score)

0.9173076923076923


In [11]:
# Keep the decision tree only when it scored strictly higher; on a tie or a
# lower score the random forest is kept.
final_model = dt_model if dt_score > rf_score else rf_model

# Both candidates were trained on features from the same vectorizer, so the
# choice of model does not affect which vectorizer is exported.
final_vectorizer = vect

In [12]:
# Persist the selected vectorizer and model to disk. The files are opened with
# context managers so each handle is flushed and closed as soon as its dump
# finishes (or fails); the original left both handles open.
with open('final_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(final_vectorizer, vectorizer_file)

with open('final_model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)

In [18]:
# NOTE(review): this import would normally live in the notebook's first import
# cell; it is kept here so the cell still runs on its own.
from sklearn.metrics import precision_score

# Precision of the decision tree on the held-out split, treating label 1 as
# the positive class.
dt_predictions = dt_model.predict(x_test_vect)
dt_precision = precision_score(
    y_true=y_test,
    y_pred=dt_predictions,
    pos_label=1,
    average='binary',
)
print(f'Decision Tree Precision: {dt_precision}')

Decision Tree Precision: 0.8748187530207829


In [19]:
# Precision of the random forest on the held-out split, treating label 1 as
# the positive class.
rf_predictions = rf_model.predict(x_test_vect)
rf_precision = precision_score(
    y_true=y_test,
    y_pred=rf_predictions,
    pos_label=1,
    average='binary',
)
print(f'Random Forest Precision: {rf_precision}')

Random Forest Precision: 0.9376299376299376
