<a href="https://colab.research.google.com/github/gregoriusavip/AI-text-classifier/blob/data-cleansing/train_model/Sentiment_analysis_using_Scikit_RFC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score
# function for transforming documents into counts
from sklearn.feature_extraction.text import CountVectorizer
# function for encoding categories
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the two Twitter CSV files (training and validation) from /content.
train_csv_path = "/content/twitter_training.csv"
test_csv_path = "/content/twitter_validation.csv"
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Count the missing values in each column of the training frame
print(train_data.isna().sum())

id           0
type         0
sentiment    0
text         0
dtype: int64


In [None]:
# Count the missing values in each column of the validation frame
print(test_data.isna().sum())

id           0
type         0
sentiment    0
text         0
dtype: int64


In [None]:
# Drop every row that has a missing value in any column. The validation frame
# gets the same treatment as the training frame: CountVectorizer rejects NaN
# documents, so a null `text` left in test_data would make the later
# vectorizer.transform(test_data['text']) call fail.
train_data = train_data.dropna()
test_data = test_data.dropna()
# Check if all null data removed
print(train_data.isnull().sum())

id           0
type         0
sentiment    0
text         0
dtype: int64


In [None]:
# Encode the sentiment labels as integers; the encoder is fitted on the
# training labels and then reused as-is for the validation labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_data['sentiment'])
y_test = encoder.transform(test_data['sentiment'])

# Turn the tweet text into token-count features; the vectorizer is fitted on
# the training text and then applied unchanged to the validation text.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(train_data['text'])
x_test = vectorizer.transform(test_data['text'])

# Report the shape of each feature matrix / label vector
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

(73996, 31062)
(73996,)
(1000, 31062)
(1000,)


In [None]:
# Build a 100-tree random forest with a fixed seed and train it on the count
# features in one expression; fit() returns the fitted estimator itself.
rfc = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(x_train, y_train)

# Predict integer-encoded sentiment labels for the validation features
y_pred = rfc.predict(x_test)

In [None]:
# Score the validation predictions against the true encoded labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# NOTE(review): if this comes out very high for a plain bag-of-words model,
# check that the validation rows do not also appear in the training CSV.
print(f"Accuracy: {accuracy}")

Accuracy: 0.968


In [None]:
# Ad-hoc check of the trained model on a few hand-written sentences
docs_new = [
    "I hate you",
    "that was cool!",
    "I'm sorry",
]
# Reuse the already-fitted vectorizer so the features line up with training
X_new_counts = vectorizer.transform(docs_new)
predicted = rfc.predict(X_new_counts)

# These are integer-encoded labels, not the sentiment strings
print(predicted)

[1 3 0]


In [None]:
# Map the integer predictions back to the original sentiment strings
predicted_categories = encoder.inverse_transform(y=predicted)
print(predicted_categories)

['Negative' 'Positive' 'Irrelevant']


In [None]:
import joblib

In [None]:
# Serialize the trained forest to the notebook's working directory
filename = 'rfc_sentiment_model.joblib'
joblib.dump(value=rfc, filename=filename)

['rfc_sentiment_model.joblib']

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (only works inside Colab)
drive.mount('/content/drive')

# Write a copy of the trained forest to Google Drive
filename = '/content/drive/My Drive/rfc_sentiment_model.joblib'
joblib.dump(value=rfc, filename=filename)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['/content/drive/My Drive/rfc_sentiment_model.joblib']

In [None]:
# Write the fitted CountVectorizer to Google Drive; new text must be
# transformed with this same object before it is passed to the model.
filename = '/content/drive/My Drive/rfc_vectorizer.joblib'
joblib.dump(value=vectorizer, filename=filename)

['/content/drive/My Drive/rfc_vectorizer.joblib']

In [None]:
# Write the fitted CountVectorizer to the notebook's working directory
filename = 'rfc_vectorizer.joblib'
joblib.dump(value=vectorizer, filename=filename)

['rfc_vectorizer.joblib']

In [None]:
# Write the fitted LabelEncoder to Google Drive; it is what converts the
# model's integer predictions back into sentiment strings.
filename = '/content/drive/My Drive/rfc_encoder.joblib'
joblib.dump(value=encoder, filename=filename)

['/content/drive/My Drive/rfc_encoder.joblib']

In [None]:
# Write the fitted LabelEncoder to the notebook's working directory
filename = 'rfc_encoder.joblib'
joblib.dump(value=encoder, filename=filename)

['rfc_encoder.joblib']