Step 1: Load and Explore the Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Read the Twitter leaders CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='twitter_leaders.csv')

In [3]:
# Sanity check: show the top five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,username,bio,profile_url,location,website
0,0,vpindia,official twitter account of the vice president...,https://twitter.com/VPIndia,"new delhi, india",https://t.co/1u25beBv9x
1,1,vp,vice president of the united states. wife to t...,https://twitter.com/VP,,https://t.co/vbwTCDuFie
2,2,madam_president,a project of @emilyslist with the goal of elec...,https://twitter.com/Madam_President,,https://t.co/woVJokZGQb
3,3,jessica_alupo,2nd female vice president of the republic of u...,https://twitter.com/jessica_alupo,"kampala, uganda",https://t.co/g327WBeX24
4,4,potus,"46th president of the united states, husband t...",https://twitter.com/POTUS,,https://t.co/MzB1JWfbJ0


In [4]:
# Report (rows, columns) of the dataset
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

Shape of the dataset: (5432, 6)


In [5]:
# Count null entries per column
missing_per_column = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_per_column)


Missing values in each column:
Unnamed: 0       0
username         0
bio              0
profile_url      0
location       818
website        872
dtype: int64


In [6]:
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (doing so emits a stray "None" line).
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5432 entries, 0 to 5431
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   5432 non-null   int64 
 1   username     5432 non-null   object
 2   bio          5432 non-null   object
 3   profile_url  5432 non-null   object
 4   location     4614 non-null   object
 5   website      4560 non-null   object
dtypes: int64(1), object(5)
memory usage: 254.8+ KB
None


In [7]:
# Summary statistics for every column (include='all' covers both the numeric
# and the object-typed columns, not only the numeric ones)
summary_stats = df.describe(include='all')
print("\nSummary Statistics:")
print(summary_stats)


Summary Statistics:
         Unnamed: 0 username    bio                  profile_url    location  \
count   5432.000000     5432   5432                         5432        4614   
unique          NaN     4918   5315                         4918        2065   
top             NaN  vpindia  Actor  https://twitter.com/VPIndia  Toronto,ON   
freq            NaN        3     15                            3          87   
mean    2716.742084      NaN    NaN                          NaN         NaN   
std     1569.488478      NaN    NaN                          NaN         NaN   
min        0.000000      NaN    NaN                          NaN         NaN   
25%     1357.750000      NaN    NaN                          NaN         NaN   
50%     2715.500000      NaN    NaN                          NaN         NaN   
75%     4076.250000      NaN    NaN                          NaN         NaN   
max     5434.000000      NaN    NaN                          NaN         NaN   

        website  


In [8]:
# Drop the unnecessary 'Unnamed: 0' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# Show the leading rows again so the removed column can be checked visually
print("Updated DataFrame:")
df.head(n=5)

Updated DataFrame:


Unnamed: 0,username,bio,profile_url,location,website
0,vpindia,official twitter account of the vice president...,https://twitter.com/VPIndia,"new delhi, india",https://t.co/1u25beBv9x
1,vp,vice president of the united states. wife to t...,https://twitter.com/VP,,https://t.co/vbwTCDuFie
2,madam_president,a project of @emilyslist with the goal of elec...,https://twitter.com/Madam_President,,https://t.co/woVJokZGQb
3,jessica_alupo,2nd female vice president of the republic of u...,https://twitter.com/jessica_alupo,"kampala, uganda",https://t.co/g327WBeX24
4,potus,"46th president of the united states, husband t...",https://twitter.com/POTUS,,https://t.co/MzB1JWfbJ0


In [10]:
# Fill missing values in 'location' and 'website'.
# Assign the filled Series back to the column instead of calling
# fillna(inplace=True) on df[col]: that chained in-place form operates on a
# temporary selection, is deprecated in recent pandas, and leaves df unchanged
# when Copy-on-Write is enabled.
df['location'] = df['location'].fillna('Unknown')
df['website'] = df['website'].fillna('No website')

In [11]:
# Re-count nulls per column; every count should now be zero
print("Missing values after filling:")
remaining_missing = df.isnull().sum()
remaining_missing

Missing values after filling:


username       0
bio            0
profile_url    0
location       0
website        0
dtype: int64

In [12]:
import re

# Function to clean the text in the 'bio' column
def clean_text(text):
    """Normalize a bio string before it is vectorized.

    Processing order:
      1. Remove URL-like tokens (anything starting with ``http`` or ``www``
         up to the next whitespace).
      2. Remove every character that is not an ASCII letter or whitespace
         (digits and punctuation are deleted, not replaced by a space).
      3. Lowercase the result.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a float NaN from a
        missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for non-string input.
    """
    # Missing cells arrive as None / NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # 'http\S+' already matches 'https...' so no separate https alternative is
    # needed, and no flag is required because the pattern has no ^ / $ anchors.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Run every bio through clean_text and store the result back on the frame
cleaned_bios = df['bio'].apply(clean_text)
df['bio'] = cleaned_bios

# Preview the cleaned column
print("Cleaned 'bio' column:")
print(cleaned_bios.head())


Cleaned 'bio' column:
0    official twitter account of the vice president...
1    vice president of the united states wife to th...
2    a project of emilyslist with the goal of elect...
3    nd female vice president of the republic of ug...
4    th president of the united states husband to f...
Name: bio, dtype: object


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with the vocabulary capped at 1000 terms (tune max_features as needed)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary from the cleaned bios and encode them in one pass
tfidf_matrix = tfidf_vectorizer.fit_transform(df['bio'])

# Densify into a DataFrame with one named column per vocabulary term
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Report the matrix dimensions and preview the leading rows
print("TF-IDF matrix shape:", tfidf_df.shape)
print("TF-IDF DataFrame head:")
print(tfidf_df.head())


TF-IDF matrix shape: (5432, 1000)
TF-IDF DataFrame head:
   about  academy   account  accounts  across  action  activist  actor  \
0    0.0      0.0  0.293001       0.0     0.0     0.0       0.0    0.0   
1    0.0      0.0  0.000000       0.0     0.0     0.0       0.0    0.0   
2    0.0      0.0  0.000000       0.0     0.0     0.0       0.0    0.0   
3    0.0      0.0  0.000000       0.0     0.0     0.0       0.0    0.0   
4    0.0      0.0  0.000000       0.0     0.0     0.0       0.0    0.0   

   actress  administration  ...  year  years  york  you  young  your  youth  \
0      0.0             0.0  ...   0.0    0.0   0.0  0.0    0.0   0.0    0.0   
1      0.0             0.0  ...   0.0    0.0   0.0  0.0    0.0   0.0    0.0   
2      0.0             0.0  ...   0.0    0.0   0.0  0.0    0.0   0.0    0.0   
3      0.0             0.0  ...   0.0    0.0   0.0  0.0    0.0   0.0    0.0   
4      0.0             0.0  ...   0.0    0.0   0.0  0.0    0.0   0.0    0.0   

   youtube  youtuber  y

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle

In [15]:
# Placeholder target: this cell does not load real PQ labels, it assigns each
# row a uniform random score in [0, 1). Metrics computed from it therefore do
# not measure any real signal.
# A fixed-seed generator keeps the scores identical across Restart & Run All
# (the seed matches the random_state used for the train/test split).
df['pq_score'] = np.random.default_rng(42).random(len(df))

In [16]:
# Features (X) are the TF-IDF columns; the target (y) is the pq_score column
X, y = tfidf_df, df['pq_score']

In [17]:
# Hold out 20% of the rows for testing; fixed random_state for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [18]:
# Fit an ordinary linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train, y_train)
model


In [19]:
# Persist the fitted model and the fitted vectorizer as pickle files
for artifact_path, artifact in (
    ('twitter_leaders_model.pkl', model),
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model trained and saved successfully!")

Model trained and saved successfully!


In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [21]:
# Score the held-out rows with the fitted model
y_pred = model.predict(X=X_test)

In [22]:
# Compare held-out targets with predictions: MAE, MSE and R²
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [23]:
# Emit the metrics report, one line per metric, four decimals each
print(
    "Model Evaluation Metrics:",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Model Evaluation Metrics:
Mean Absolute Error (MAE): 0.2720
Mean Squared Error (MSE): 0.1081
R² Score: -0.3062


In [24]:
def predict_pq_score(new_bios, model_path='twitter_leaders_model.pkl',
                     vectorizer_path='tfidf_vectorizer.pkl'):
    """Predict PQ scores for raw bio strings using the pickled artifacts.

    The bios go through the same pipeline as the training data: ``clean_text``
    first, then the fitted TF-IDF vectorizer, then the regression model.

    Parameters
    ----------
    new_bios : str or iterable of str
        One bio, or several. A lone string is treated as a single bio.
    model_path : str
        Path to the pickled regression model.
    vectorizer_path : str
        Path to the pickled, already-fitted TF-IDF vectorizer.

    Returns
    -------
    array
        One prediction per bio, as returned by the model's ``predict``;
        an empty array when no bios are given.

    Raises
    ------
    FileNotFoundError
        If either pickle file does not exist.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(new_bios, str):
        new_bios = [new_bios]

    # Apply the training-time text cleaning so inference sees the same token
    # space the vectorizer was fitted on.
    cleaned_bios = [clean_text(bio) for bio in new_bios]
    if not cleaned_bios:
        return np.array([])

    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    # Load the persisted vectorizer rather than depending on whatever
    # `tfidf_vectorizer` happens to be in the kernel's global namespace.
    with open(vectorizer_path, 'rb') as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)

    # Rebuild the same named-column DataFrame layout used for training so the
    # model receives the feature names it was fitted with.
    new_tfidf = pd.DataFrame(
        loaded_vectorizer.transform(cleaned_bios).toarray(),
        columns=loaded_vectorizer.get_feature_names_out(),
    )

    # Make predictions
    return loaded_model.predict(new_tfidf)

# Demonstrate the prediction helper on one sample bio
sample_bio = "official twitter account of the vice president of india, shri jagdeep dhankhar."
new_bios = [sample_bio]

predicted_scores = predict_pq_score(new_bios=new_bios)
print("Predicted PQ Scores:", predicted_scores)


Predicted PQ Scores: [0.67113238]


