In [None]:
# Mount Google Drive at /content/drive; the later cells read and write
# their data and model files under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import vstack, save_npz
import joblib
import warnings

# Preprocessing cell: load the training CSV, derive simple numeric features,
# learn a TF-IDF vocabulary over the review text, and persist the sparse
# text features, the numeric features and the fitted vectorizer.

# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load datasets
train_data = pd.read_csv('/content/drive/MyDrive/cs506midterm/train.csv')
print(f"Original number of rows in train data: {len(train_data)}")

# Handling missing values in 'Text' and 'Summary'.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that form is chained assignment and is not guaranteed to modify
# the parent DataFrame.
train_data['Text'] = train_data['Text'].fillna('')
train_data['Summary'] = train_data['Summary'].fillna('')

# Feature Engineering
train_data['review_length'] = train_data['Text'].apply(len)
train_data['summary_length'] = train_data['Summary'].apply(len)
# +1 in the denominator avoids division by zero for reviews without votes.
train_data['helpfulness_ratio'] = train_data['HelpfulnessNumerator'] / (train_data['HelpfulnessDenominator'] + 1)

# Save non-text features for every row (rows with a missing Score included),
# so that row i here corresponds to row i of train.csv.
non_text_features = train_data[['review_length', 'summary_length', 'helpfulness_ratio']]

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)
chunk_size = 10000
text_list = train_data['Text'].tolist()

# Fit ONCE on the whole corpus. Re-fitting inside the chunk loop would learn a
# different vocabulary and different IDF weights for every chunk, so the same
# column index would mean different tokens in different rows, and the saved
# vectorizer would only describe the last chunk.
# NOTE(review): fitting on the full corpus needs more memory than a per-chunk
# fit; if that does not fit in RAM, fit on a random sample of the rows instead.
tfidf.fit(text_list)

# Transform in chunks so progress can be reported along the way.
num_chunks = -(-len(text_list) // chunk_size)  # ceiling division
text_features = []
for chunk_number, start in enumerate(range(0, len(text_list), chunk_size), start=1):
    chunk = text_list[start:start + chunk_size]
    text_features.append(tfidf.transform(chunk))
    print(f"Processed chunk {chunk_number} of {num_chunks}")

# Combine all TF-IDF features into a single sparse matrix
text_features = vstack(text_features)
print("TF-IDF Transformation complete!")

# Save TF-IDF features and model
save_npz('/content/drive/MyDrive/cs506midterm/text_features_sparse.npz', text_features)
non_text_features.to_csv('/content/drive/MyDrive/cs506midterm/non_text_features.csv', index=False)
joblib.dump(tfidf, '/content/drive/MyDrive/cs506midterm/tfidf_model.pkl')

print("Preprocessing complete! Sparse text features saved to 'text_features_sparse.npz', non-text features to 'non_text_features.csv', and TF-IDF model saved to 'tfidf_model.pkl'.")


Original number of rows in train data: 1697533
Processed chunk 1 of 170
Processed chunk 2 of 170
Processed chunk 3 of 170
Processed chunk 4 of 170
Processed chunk 5 of 170
Processed chunk 6 of 170
Processed chunk 7 of 170
Processed chunk 8 of 170
Processed chunk 9 of 170
Processed chunk 10 of 170
Processed chunk 11 of 170
Processed chunk 12 of 170
Processed chunk 13 of 170
Processed chunk 14 of 170
Processed chunk 15 of 170
Processed chunk 16 of 170
Processed chunk 17 of 170
Processed chunk 18 of 170
Processed chunk 19 of 170
Processed chunk 20 of 170
Processed chunk 21 of 170
Processed chunk 22 of 170
Processed chunk 23 of 170
Processed chunk 24 of 170
Processed chunk 25 of 170
Processed chunk 26 of 170
Processed chunk 27 of 170
Processed chunk 28 of 170
Processed chunk 29 of 170
Processed chunk 30 of 170
Processed chunk 31 of 170
Processed chunk 32 of 170
Processed chunk 33 of 170
Processed chunk 34 of 170
Processed chunk 35 of 170
Processed chunk 36 of 170
Processed chunk 37 of 170


In [None]:
from scipy.sparse import load_npz, hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import pandas as pd

# Training cell: load the precomputed features, keep only the rows that have a
# Score label, train two classifiers, report validation metrics and persist
# the one with the higher validation accuracy.

# Load the sparse TF-IDF features and non-text features
text_features = load_npz('/content/drive/MyDrive/cs506midterm/text_features_sparse.npz')
non_text_features = pd.read_csv('/content/drive/MyDrive/cs506midterm/non_text_features.csv')

# Load the raw training data; its rows are expected to line up one-to-one
# with the rows of the saved feature files.
train_data = pd.read_csv('/content/drive/MyDrive/cs506midterm/train.csv')
if not (text_features.shape[0] == len(non_text_features) == len(train_data)):
    raise ValueError(
        "Saved features and train.csv have different row counts: "
        f"text={text_features.shape[0]}, non_text={len(non_text_features)}, "
        f"train={len(train_data)}"
    )

# Record the positions of the labelled rows BEFORE resetting the index.
# After dropna(...).reset_index(drop=True) the index is simply 0..n-1, so
# using it to slice the features would select the first n rows rather than
# the rows whose Score is present.
labeled_positions = train_data['Score'].notna().to_numpy().nonzero()[0]

# Remove rows with NaN in 'Score' and keep features aligned with the labels
train_data = train_data.iloc[labeled_positions].reset_index(drop=True)
y = train_data['Score']
non_text_features = non_text_features.iloc[labeled_positions].reset_index(drop=True)
text_features = text_features.tocsr()[labeled_positions, :]

# Concatenate the features (CSR so the result supports row indexing)
X = hstack([text_features, non_text_features.to_numpy()], format='csr')

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_pred_rf)
print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy}")
print(classification_report(y_val, y_pred_rf))

# Support Vector Machine Classifier
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples; on a training set this large it may not finish. Consider a linear
# solver (e.g. LinearSVC) or a subsample if this step stalls.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_val)
svm_accuracy = accuracy_score(y_val, y_pred_svm)
print("Support Vector Machine Classifier:")
print(f"Accuracy: {svm_accuracy}")
print(classification_report(y_val, y_pred_svm))

# Save the best model: pick by validation accuracy instead of always saving
# the random forest (ties go to the random forest).
best_model = rf_model if rf_accuracy >= svm_accuracy else svm_model
joblib.dump(best_model, '/content/drive/MyDrive/cs506midterm/best_model.pkl')
print("Best model saved as 'best_model.pkl'.")


In [None]:
import pandas as pd
from scipy.sparse import load_npz, hstack
import joblib

# Inference cell: rebuild the training-time features for the test set, predict
# scores with the saved model and write the submission file.

# Load the test data and TF-IDF model.
# The vectorizer is read from the same Drive folder the preprocessing cell
# saved it to.
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by
# this notebook or another trusted source.
test_data = pd.read_csv('/content/drive/MyDrive/cs506midterm/test.csv')
tfidf = joblib.load('/content/drive/MyDrive/cs506midterm/tfidf_model.pkl')

# Apply feature engineering to the test set.
# Fill BOTH text columns: len() on a NaN float raises TypeError, so 'Summary'
# must be filled just like 'Text'. Assign back rather than using
# fillna(inplace=True) on a column selection (chained assignment).
test_data['Text'] = test_data['Text'].fillna('')
test_data['Summary'] = test_data['Summary'].fillna('')
test_data['review_length'] = test_data['Text'].apply(len)
test_data['summary_length'] = test_data['Summary'].apply(len)
test_data['helpfulness_ratio'] = test_data['HelpfulnessNumerator'] / (test_data['HelpfulnessDenominator'] + 1)

# Transform text data using the TF-IDF model
text_features_test = tfidf.transform(test_data['Text'])
non_text_features_test = test_data[['review_length', 'summary_length', 'helpfulness_ratio']]

# Combine sparse text features and non-text features (same column order as
# used for training: TF-IDF columns first, then the numeric features)
X_test = hstack([text_features_test, non_text_features_test.to_numpy()], format='csr')

# Load the trained model from the Drive folder the training cell saved it to,
# then predict
best_model = joblib.load('/content/drive/MyDrive/cs506midterm/best_model.pkl')
test_predictions = best_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({'Id': test_data['Id'], 'Score': test_predictions})
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created.")

In [None]:
# Download the generated submission to the local machine.
# `files` must be imported here: no earlier cell brings it into scope, so a
# fresh kernel would otherwise fail with NameError.
from google.colab import files
files.download('submission.csv')