In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# load dataset
df = pd.read_csv('ratings_comments_pairs.csv')
df = df.sample(frac=0.5)

# data clean
df.dropna(inplace=True)

# data preprocessing
def preprocess_text(text):
    text = text.lower()  # lower case
    text = re.sub(r'\W', ' ', text)  # remove characters
    text = re.sub(r'\s+', ' ', text)  # remove spaces
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]  # remove stop
    text = ' '.join(words)
    return text

# apply data preprocessing
df['comment'] = df['comment'].apply(preprocess_text)

# extract feature
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['comment']).toarray()
y = df['rating']

# split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# model training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# model valuate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qujianning/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.8412586425850148
Classification Report:
               precision    recall  f1-score   support

          -1       0.74      0.32      0.45       594
           0       0.54      0.10      0.16       742
           1       0.85      0.99      0.92      5751

    accuracy                           0.84      7087
   macro avg       0.71      0.47      0.51      7087
weighted avg       0.81      0.84      0.80      7087



### SVM

In [5]:
from sklearn.svm import SVC

# train SVC model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Valuate model
svm_pred = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

SVM Accuracy: 0.8737124312120784
SVM Classification Report:
               precision    recall  f1-score   support

          -1       0.73      0.48      0.58       594
           0       0.60      0.35      0.44       742
           1       0.90      0.98      0.94      5751

    accuracy                           0.87      7087
   macro avg       0.74      0.61      0.66      7087
weighted avg       0.86      0.87      0.86      7087



### Regression

In [6]:
from sklearn.linear_model import LogisticRegression

# train regression model
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)

# valuation
logreg_pred = logreg_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, logreg_pred))

Logistic Regression Accuracy: 0.870467052349372
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          -1       0.78      0.41      0.54       594
           0       0.62      0.35      0.45       742
           1       0.89      0.99      0.94      5751

    accuracy                           0.87      7087
   macro avg       0.76      0.58      0.64      7087
weighted avg       0.85      0.87      0.85      7087



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Xgboost

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

file_path = 'combined_data.csv'
df = pd.read_csv(file_path)
# df = df.sample(frac=0.5)
label_mapping = {-1: 0, 0: 1, 1: 2}
reverse_label_mapping = {0: -1, 1: 0, 2: 1}

# data clean
df.dropna(inplace=True)

# data preprocessing
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'\W', ' ', text)  
    text = re.sub(r'\s+', ' ', text)
    text = re.sub("["
                   u"\U0001F600-\U0001F64F"  # emoticons
                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                   u"\U00002702-\U000027B0"
                   u"\U000024C2-\U0001F251"
                   "]+", '', text, flags=re.UNICODE)  # delete emoji
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]
    text = ' '.join(words)
    return text

# apply data preprocessing
df['comment'] = df['comment'].apply(preprocess_text)

tfidf_converter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7)
X = tfidf_converter.fit_transform(df['comment']).toarray()
y_mapped = df['rating'].map(label_mapping)

X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, random_state=42)

model = xgb.XGBClassifier()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
predictions_original_labels = [reverse_label_mapping[label] for label in predictions]
# valuation
print("Accuracy:", accuracy_score(y_test.map(reverse_label_mapping), predictions_original_labels))
print("Classification Report:\n", classification_report(y_test.map(reverse_label_mapping), predictions_original_labels))

Accuracy: 0.8973240832507433
Classification Report:
               precision    recall  f1-score   support

          -1       0.95      0.90      0.92      7081
           0       0.60      0.23      0.33      1459
           1       0.89      0.98      0.93     11640

    accuracy                           0.90     20180
   macro avg       0.81      0.70      0.73     20180
weighted avg       0.89      0.90      0.88     20180



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

file_path = 'combined_data.csv'
df = pd.read_csv(file_path)
# df = df.sample(frac=0.5)
label_mapping = {-1: 0, 0: 1, 1: 2}
reverse_label_mapping = {0: -1, 1: 0, 2: 1}

# data clean
df.dropna(inplace=True)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # text = re.sub("...", '', text, flags=re.UNICODE)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

df['comment'] = df['comment'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qujianning/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
print(len(df))
print(len(df[df['rating'] == -1]))
print(len(df[df['rating'] == 0]))
print(len(df[df['rating'] == 1]))
df.head()

67264
23799
4945
38520


Unnamed: 0,rating,comment
0,0,funny w great dialogue good reflections
1,1,great bio guy never heard seems black held bac...
2,1,cheadle hits right notes bio pic dc dj petey g...
3,0,cheadle amazing actor worht watching chiwetel ...
4,1,cheadle chiwetel ejofor movie story inspiratio...
