In [18]:
import requests
import zipfile
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from textblob import TextBlob

In [19]:
# Location of the zip archive that download_and_extract_data() fetches.
# NOTE(review): this is a plain-http URL, so the download is not protected in
# transit -- confirm whether the host serves the same path over https.
DATASET_URL = "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip"


In [20]:
def download_and_extract_data():
    """Download the archive at ``DATASET_URL`` and unpack it.

    The whole archive is fetched into memory and every member is extracted
    into the current working directory.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: the server did not respond in time.
        zipfile.BadZipFile: the downloaded payload is not a valid zip archive.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(DATASET_URL, timeout=60)
    # Fail on an HTTP error here rather than handing an error page to zipfile.
    response.raise_for_status()
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall()

In [21]:
def load_data(path='training.1600000.processed.noemoticon.csv'):
    """Load the label and tweet-text columns of the training CSV.

    The file is read as header-less ISO-8859-1 text with six positional
    columns; only ``target`` and ``text`` are parsed and kept, so the four
    unused columns never occupy memory.

    Args:
        path: CSV file to read. Defaults to the training file name used by
            the rest of this notebook.

    Returns:
        pandas.DataFrame with exactly the columns ``target`` and ``text``,
        in that order.
    """
    column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
    return pd.read_csv(
        path,
        encoding='ISO-8859-1',
        names=column_names,
        # Restrict parsing to the columns that are actually returned.
        usecols=['target', 'text'],
    )

In [22]:
def preprocess_tweet(tweet):
    """Return ``tweet`` re-assembled from its TextBlob word tokens.

    The text is wrapped in a ``TextBlob`` and the entries of its ``words``
    attribute are joined back together with single spaces.

    Args:
        tweet: raw tweet text.

    Returns:
        str: the word tokens separated by one space each.
    """
    # Per TextBlob's documentation, ``words`` omits punctuation tokens.
    tokens = TextBlob(tweet).words
    return ' '.join(tokens)

In [28]:
def main():
    """Run the sentiment pipeline end to end and print its evaluation.

    Downloads and unpacks the dataset, loads it, normalises every tweet with
    ``preprocess_tweet``, holds out 20% of the rows, fits a TF-IDF +
    logistic-regression model on the remaining rows, and prints the
    classification report and accuracy for the held-out rows.
    """
    download_and_extract_data()
    tweets = load_data()

    # Normalise the text once, before splitting, so both splits share it.
    texts = tweets['text'].apply(preprocess_tweet)
    labels = tweets['target']

    # A fixed random_state keeps the 80/20 split identical between runs.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=42,
    )

    # The vectorizer is fitted on the training split only; the test split is
    # merely transformed with the already-fitted vectorizer.
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)

    model = LogisticRegression(max_iter=1000).fit(train_matrix, train_labels)
    predictions = model.predict(test_matrix)

    print("Classification Report:")
    print(classification_report(test_labels, predictions))
    print("Accuracy:", accuracy_score(test_labels, predictions))

In [29]:
# Start the pipeline only when this code runs as the main program, so that
# importing the module does not trigger the download and training.
if __name__ == '__main__':
    main()

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76    159494
           4       0.75      0.79      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

Accuracy: 0.767278125
