In [9]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

In [10]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ariha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ariha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [11]:
df = pd.read_csv("data\\train.tsv", sep="\t", encoding="latin1")
print(df.head())

                                                text     label
0  RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...  partisan
1  VIDEO - #Obamacare:  Full of Higher Costs and ...  partisan
2  Please join me today in remembering our fallen...   neutral
3  RT @SenatorLeahy: 1st step toward Senate debat...   neutral
4  .@amazon delivery #drones show need to update ...  partisan


In [12]:
# Set of English stop words
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs, @mentions, and #hashtags
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize (split into words)
    tokens = word_tokenize(text)
    # Remove stop words
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Rejoin words into a single string
    return " ".join(filtered_tokens)

# Apply the cleaning function to the 'text' column
df['cleaned_text'] = df['text'].apply(preprocess_text)

print("\nOriginal vs. Cleaned Text:")
print(df[['text', 'cleaned_text']].head())


Original vs. Cleaned Text:
                                                text  \
0  RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...   
1  VIDEO - #Obamacare:  Full of Higher Costs and ...   
2  Please join me today in remembering our fallen...   
3  RT @SenatorLeahy: 1st step toward Senate debat...   
4  .@amazon delivery #drones show need to update ...   

                                        cleaned_text  
0                          rt rep trey radel r slams  
1            video full higher costs broken promises  
2  please join today remembering fallen heroes ho...  
3  rt st step toward senate debate leahycrapo bil...  
4  delivery show need update law promote protect ...  


In [15]:
df['cleaned_text'].to_csv("out.csv")

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = df['cleaned_text']
y = df['label']

vectorizer = TfidfVectorizer(max_features=5000)

X_tfidf = vectorizer.fit_transform(X)

print(f"\nShape of the TF-IDF matrix: {X_tfidf.shape}")


Shape of the TF-IDF matrix: (5000, 5000)


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=78)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.7640

Classification Report:
              precision    recall  f1-score   support

     neutral       0.77      0.96      0.86       739
    partisan       0.66      0.20      0.31       261

    accuracy                           0.76      1000
   macro avg       0.72      0.58      0.58      1000
weighted avg       0.74      0.76      0.71      1000



In [25]:
def predict_partisanship(text_message):
    cleaned_message = preprocess_text(text_message)
    
    vectorized_message = vectorizer.transform([cleaned_message])
    
    prediction = model.predict(vectorized_message)
    
    return prediction[0]

new_text_1 = "Higher costs are bad"
prediction_1 = predict_partisanship(new_text_1)
print(f"\nMessage: '{new_text_1}'")
print(f"Predicted Label: {prediction_1}")

new_text_2 = "The committee will hold a hearing tomorrow at 10 AM to discuss the new transportation bill."
prediction_2 = predict_partisanship(new_text_2)
print(f"\nMessage: '{new_text_2}'")
print(f"Predicted Label: {prediction_2}")


Message: 'Higher costs are bad'
Predicted Label: partisan

Message: 'The committee will hold a hearing tomorrow at 10 AM to discuss the new transportation bill.'
Predicted Label: neutral
