In [None]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import resample


In [None]:
# Load the labelled tweets.
df = pd.read_csv('train.csv')  # Make sure your file is named train.csv

# 5. Preprocessing
# Columns used by this notebook: 'tweet' (raw text) and 'label' (class id).
# Fail early with a readable message if the CSV has a different schema.
missing_cols = {'tweet', 'label'} - set(df.columns)
assert not missing_cols, f"train.csv is missing expected columns: {sorted(missing_cols)}"

# Normalise the text: lower-case, keep only a-z and whitespace, trim the ends.
# fillna('') comes first because astype(str) alone would turn a missing tweet
# into the literal text 'nan', which would then be learned as a real word.
# NOTE: this cleaning runs outside the sklearn pipeline, so any consumer of the
# saved model has to apply the same normalisation to its input text.
df['text'] = (
    df['tweet']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-z\s]', '', regex=True)
    .str.strip()
)


In [None]:
# 6. Hold out a test set BEFORE balancing.
# Upsampling with replacement duplicates minority rows; splitting afterwards
# would put copies of the same tweet in both train and test and inflate the
# evaluation. `stratify` keeps the original class ratio in both folds.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)

# 7. Balance the classes in the TRAINING fold only: label 1 (treated as the
# minority class) is resampled with replacement up to the size of label 0.
df_majority = df_train[df_train.label == 0]
df_minority = df_train[df_train.label == 1]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Features / targets consumed by the training and evaluation cells.
# The test fold keeps its natural (unbalanced) class distribution.
X_train, y_train = df_balanced['text'], df_balanced['label']
X_test, y_test = df_test['text'], df_test['label']


In [None]:

# 8. Build and train the pipeline
# Token counts (English stop words removed) -> TF-IDF weighting ->
# linear classifier fitted by SGD with logistic loss.
# NOTE(review): X_train / y_train are not created in this cell; an earlier
# cell must define them before this one runs.
steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42, loss='log_loss')),
]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)


In [None]:
# 9. Evaluate (optional)
# classification_report is imported in the notebook's top import cell.
# X_test / y_test must be defined by an earlier cell before this one runs.
# print() is intentional: the report is a multi-line string, and a bare
# expression would show its repr with escaped newlines.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.93      0.94      5880
           1       0.93      0.96      0.95      6008

    accuracy                           0.94     11888
   macro avg       0.95      0.94      0.94     11888
weighted avg       0.94      0.94      0.94     11888



In [None]:
# 10. Persist the fitted pipeline so the Streamlit app can load it
model_path = 'sgd_pipeline_model.pkl'
joblib.dump(pipeline, model_path)

['sgd_pipeline_model.pkl']

In [None]:
# 11. Download the model (Google Colab only)
# google.colab exists only inside a Colab runtime, so the import stays local
# to this cell and is guarded; elsewhere the browser download is skipped
# instead of aborting the run with ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of 'sgd_pipeline_model.pkl'.")
else:
    files.download('sgd_pipeline_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>