#Importing libaries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#Mounting Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def bad_line(x):
    print(x)
    return None

df_data = pd.read_csv('/content/drive/Othercomputers/My Laptop (1)/year4/final_project/data_science/data_set/topic_data_1.csv',  on_bad_lines=bad_line, engine='python')


df_data.dtypes

Tweet                       object
Date                        object
time                        object
Day of week                 object
Cashtags                    object
Hashtags                    object
Language                    object
Location                      bool
Mentioned_users               bool
Followers                  float64
Following                  float64
User_created_date           object
Listed_count               float64
Favourite_count            float64
Tweet_count                float64
Verified                      bool
Average_favourite_count    float64
account_age                float64
Likes                      float64
Comments                   float64
Retweets                   float64
Views                      float64
clean_tweet                 object
subjectivity               float64
polarity                   float64
sentiment                   object
topics                       int64
key_words                   object
dtype: object

In [4]:
df_data_linear = df_data.copy()

In [5]:
# Convert "Cashtags" column to boolean
df_data_linear['Cashtags'] = np.where(df_data_linear['Cashtags'].notnull(), True, False)

# Convert "Hashtags" column to boolean
df_data_linear['Hashtags'] = np.where(df_data_linear['Hashtags'].notnull(), True, False)


In [6]:
# Drop the columns we won't use for prediction
df_data_linear.drop(['Tweet', 'Date', 'Listed_count', 'Favourite_count', 'Tweet_count', 'User_created_date', 'Views'], axis=1, inplace=True)


#Cleaning the data for Linear regression


# Convert 'time' column to numerical

In [7]:
df_data_linear['time'] = pd.to_datetime(df_data_linear['time']).dt.hour

# Encode categorical variables

In [8]:
cat_cols = ['Day of week', 'Language', 'sentiment', 'topics', 'key_words', 'time']
for col in cat_cols:
    label_encoder = LabelEncoder()
    df_data_linear[col] = label_encoder.fit_transform(df_data_linear[col])


In [None]:
# Create feature matrix with one-hot encoded categorical variables and token counts
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(df_data_linear['clean_tweet'])
X_text = vectorizer.transform(df_data_linear['clean_tweet'])
cat_cols = ['Cashtags', 'Hashtags', 'Location', 'Mentioned_users', 'Verified', 'Day of week', 'Language', 'sentiment', 'topics', 'key_words', 'time']
cat_transformer = OneHotEncoder()
cat_transformer.fit(df_data_linear[cat_cols])
X_cat = cat_transformer.transform(df_data_linear[cat_cols])
X_num = StandardScaler().fit_transform(df_data_linear.select_dtypes(include=np.number))
X = np.hstack([X_cat.toarray(), X_text.toarray(), X_num])


In [None]:

# Create target vectors
y_likes = df_data_linear['Likes']
y_comments = df_data_linear['Comments']
y_retweets = df_data_linear['Retweets']


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_likes_train, y_likes_test, y_comments_train, y_comments_test, y_retweets_train, y_retweets_test = train_test_split(X, y_likes, y_comments, y_retweets, test_size=0.2, random_state=42)

In [None]:
label_encoder = LabelEncoder()
df_data_linear['Day of week'] = label_encoder.fit_transform(df_data_linear['Day of week'])
df_data_linear['Language'] = label_encoder.fit_transform(df_data_linear['Language'])
df_data_linear['sentiment'] = label_encoder.fit_transform(df_data_linear['sentiment'])
df_data_linear['topics'] = label_encoder.fit_transform(df_data_linear['topics'])
df_data_linear['key_words'] = label_encoder.fit_transform(df_data_linear['key_words'])


# create an instance of CountVectorizer with desired settings
vectorizer = CountVectorizer(stop_words='english')

# fit the vectorizer to the clean_tweet column
vectorizer.fit(df_data_linear['clean_tweet'])

# transform the clean_tweet column into a matrix of token counts
X_text = vectorizer.transform(df_data_linear['clean_tweet'])

# One-hot encode boolean variables

In [None]:
transformer = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), ['Cashtags', 'Hashtags', 'Location', 'Mentioned_users', 'Verified'])], remainder='passthrough')
df_data_linear = pd.DataFrame(transformer.fit_transform(df_data_linear))

In [None]:
# concatenate the one-hot encoded features, text features, and numerical features
X = np.concatenate((transformer.transform(df_data_linear), X_text.toarray(), df_data_linear[['Followers', 'Friends', 'time', 'Day of week', 'Language', 'sentiment']].values), axis=1)

# scale the numerical features using StandardScaler
scaler = StandardScaler()
X[:, -6:] = scaler.fit_transform(X[:, -6:])

# set the target variables (Likes, Comments, and Retweets)
y = df_data_linear[['Likes', 'Comments', 'Retweets']].values

ValueError: ignored