# Random Forest TF-IDF

### Import Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing

### Data Loading

In [2]:
# Read the cleaned-data CSV into the working DataFrame.
data_path = "./output/data_clean.csv"
df = pd.read_csv(data_path)

In [3]:
# Number of missing values in the column that is vectorized further down.
df['text_final'].isna().sum()

np.int64(1)

In [4]:
# Drop rows with a missing value in `content` or in `text_final`.
# The null check above is on `text_final`, and that is the column fed to
# TfidfVectorizer below (which raises on NaN documents), so it has to be part
# of the subset rather than relying on the nulls coinciding with `content`.
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
df = df.dropna(subset=['content', 'text_final'])

In [5]:
# Per-column count of missing values after the drop; every entry should be 0.
null_counts = df.isna().sum()
null_counts

content                 0
score                   0
text_clean              0
text_casefoldingText    0
text_slangwords         0
text_tokenizingText     0
text_stopword           0
text_final              0
polarity_score          0
polarity                0
dtype: int64

In [6]:
# Map the `polarity` labels to integer codes; LabelEncoder numbers the classes
# in sorted order, and the fitted encoder keeps that mapping in `classes_`.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['polarity'])
df['polarity_encode'] = label_encoder.transform(df['polarity'])

### Embedding

TF-IDF

In [7]:
# Model input is the preprocessed text; the target is the integer-encoded polarity.
X_input, y_input = df['text_final'], df['polarity_encode']

In [8]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y_input, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, **split_options)

In [9]:
# Vectorizer settings:
#   max_features - keep at most the 5000 highest-frequency terms
#   min_df       - ignore terms that occur in fewer than 3 documents
#   max_df       - ignore terms that occur in more than 85% of documents
#   ngram_range  - extract unigrams and bigrams
tfidf_params = {
    'max_features': 5000,
    'min_df': 3,
    'max_df': 0.85,
    'ngram_range': (1, 2),
}
tfidf = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Modelling

Algoritma Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest trained on the TF-IDF features.
# class_weight='balanced' weights classes inversely to their frequency in
# y_train; random_state fixes the randomness so the fit is repeatable.
random_forest = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
    max_features='sqrt',
    class_weight='balanced',
)

# RandomForestClassifier accepts scipy sparse matrices for both fit and
# predict, so the TF-IDF output is passed as-is. Calling .toarray() would
# materialize a dense (n_samples x n_features) array each time for no benefit.
random_forest.fit(X_train_tfidf, y_train)

y_pred_train_rf = random_forest.predict(X_train_tfidf)
y_pred_test_rf = random_forest.predict(X_test_tfidf)

# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value is
# unaffected, but the documented order keeps this consistent with the
# classification_report / confusion_matrix calls in the next cell.
accuracy_train_rf = accuracy_score(y_train, y_pred_train_rf)
accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)

In [11]:
# Train and test accuracy side by side (a large gap would indicate overfitting).
for metric_label, metric_value in (
    ('accuracy_train:', accuracy_train_rf),
    ('accuracy_test:', accuracy_test_rf),
):
    print(metric_label, metric_value)

# Per-class precision/recall/F1 and the confusion matrix for the test split.
test_report = classification_report(y_test, y_pred_test_rf)
test_confusion = confusion_matrix(y_test, y_pred_test_rf)
print(test_report)
print(test_confusion)

accuracy_train: 0.9093655589123867
accuracy_test: 0.8708333333333333
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1004
           1       0.26      0.27      0.27        55
           2       0.92      0.88      0.90      1341

    accuracy                           0.87      2400
   macro avg       0.68      0.68      0.68      2400
weighted avg       0.87      0.87      0.87      2400

[[ 895   11   98]
 [  29   15   11]
 [ 130   31 1180]]


Save model

In [13]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('./assets', exist_ok=True)

# Persist everything needed to run the model outside this notebook:
#   - the fitted TF-IDF vectorizer (to featurize new text the same way),
#   - the fitted label encoder (to map integer predictions back to polarity labels),
#   - the trained random forest itself.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(tfidf, './assets/tf_idf.joblib')
joblib.dump(label_encoder, './assets/label_encoder.joblib')
joblib.dump(random_forest, './assets/rf_tfidf.joblib')

['./assets/rf_tfidf.joblib']