In [6]:
import json
import pandas as pd
from pathlib import Path
from textblob import TextBlob

In [2]:
data = []
with open("posts.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))


In [3]:
df = pd.DataFrame(data)
df["text_length"] = df["text"].apply(len)
df["sentiment"] = df["text"].apply(lambda x:TextBlob(x).sentiment.polarity)
df["comment_ratio"] = df["comment_count"] / (df["vote_total"] + 1)
df["controversial"] = (df["comment_count"] >= 3).astype(int)
df["created_at"] = pd.to_datetime(df["created_at"])
df["created_hour"] = df["created_at"].dt.hour
df["created_day"] = df["created_at"].dt.date
df.to_csv("processed_simple.csv", index=False)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Select features
features = ["text_length", "sentiment", "comment_ratio", "created_hour"]
X = df[features]
y = df["controversial"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8549019607843137
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       205
           1       0.88      0.30      0.45        50

    accuracy                           0.85       255
   macro avg       0.87      0.65      0.68       255
weighted avg       0.86      0.85      0.82       255

Confusion Matrix:
 [[203   2]
 [ 35  15]]
