In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [2]:
# Cut-offs applied to the held-out replies further down:
#   time_cut_replies is compared against the replies' `time_diff` column,
#   time_cut_posts against `min_since_fst_post` (minutes since the earliest post time).
time_cut_replies, time_cut_posts = 80, 15

In [3]:
# Locations of the pickled posts / replies frames read by the cells below.
# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.
file_path_posts = "/home/azureuser/rumour-detection-pheme/posts_charlie_hebdo.pkl"
file_path_replies = "/home/azureuser/rumour-detection-pheme/replies_charlie_hebdo.pkl"


In [4]:
# Read the replies frame first, then the posts frame.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_replies, df_posts = (
    pd.read_pickle(pickle_path)
    for pickle_path in (file_path_replies, file_path_posts)
)

In [5]:
# Column subsets: the post-level columns kept before merging, and the reply-level columns of interest.
post_features = [
    'followers',
    'favorite_count',
    'retweet_count',
    'verified',
    'rumour',
    'id',
    'embeddings_avg',
]
reply_features = [
    'reply_followers',
    'reply_user_id',
    'reply_verified',
    'time_diff',
    'reply_embeddings_avg',
    'id',
]

In [6]:
# Minutes between each row's `time` and the earliest `time` in the replies frame (2 decimals).
df_replies['min_since_fst_post'] = (
    (df_replies['time'] - df_replies['time'].min())
    .dt.total_seconds()
    .div(60)
    .round(2)
)

# One row per post id: number of reply rows and the first `time_diff` encountered for that id.
grouped_replies = (
    df_replies
    .groupby(['id'])
    .agg(replies=('time_diff', 'count'), first_time_diff=('time_diff', 'first'))
    .reset_index()
)

In [7]:
# Attach the per-post reply statistics. The inner join keeps only posts whose id
# appears in grouped_replies (i.e. posts with at least one reply row).
df_posts = df_posts[post_features].merge(grouped_replies, on="id", how="inner")
df_posts['replies'] = df_posts['replies'].fillna(0)
df_posts['first_time_diff'] = df_posts['first_time_diff'].fillna(0)

# One-hot encode `verified` into `no_verified` / `verified` (appended as the last two columns).
# Both indicator columns are built explicitly: pd.get_dummies only emits a column for values
# that actually occur, so a frame containing only verified (or only unverified) posts would
# otherwise be missing a column and break the later column selection with a KeyError.
_verified_flag = df_posts['verified'].astype(str).replace({'True': '1', 'False': '0'}).astype(int)
df_posts = df_posts.drop(columns=['verified']).assign(
    no_verified=(_verified_flag == 0).astype(int),
    verified=(_verified_flag == 1).astype(int),
)

In [8]:
# 70 / 15 / 15 split, stratified on the `rumour` label, with a fixed seed.
train, not_train = train_test_split(
    df_posts,
    stratify=df_posts['rumour'],
    test_size=0.3,
    random_state=42,
)
val, test = train_test_split(
    not_train,
    stratify=not_train['rumour'],
    test_size=0.5,
    random_state=42,
)

In [9]:
# Training-side feature frame.
# The continuous columns are scaled with a RobustScaler that is fitted here, on the training
# rows only; the indicator, embedding and label columns are copied over unscaled.
post_features = train[[
    "followers", "favorite_count", "retweet_count", "no_verified", "verified",
    "rumour", "embeddings_avg", "replies", "first_time_diff",
]]
scaler_posts = RobustScaler()
scaled_features = scaler_posts.fit_transform(
    post_features[['followers', 'favorite_count', 'retweet_count', 'first_time_diff', "replies"]]
)
# Rebuild a frame from the scaled array; values are attached positionally, so the
# result carries a fresh 0..n-1 index rather than the index of `train`.
scaled_data = pd.DataFrame(
    scaled_features,
    columns=['followers', 'favorite_count', 'retweet_count', 'first_time_diff', "replies"],
).assign(
    no_verified=train['no_verified'].to_numpy(),
    verified=train['verified'].to_numpy(),
    embeddings_avg=train['embeddings_avg'].to_numpy(),
    rumour=train['rumour'].to_numpy(),
)
train_dataset = scaled_data


In [10]:
# Display the scaled training frame built in the previous cell.
train_dataset

Unnamed: 0,followers,favorite_count,retweet_count,first_time_diff,replies,no_verified,verified,embeddings_avg,rumour
0,-0.085680,1.458015,0.528139,0.240550,0.333333,1,0,"[0.20999414314116752, 0.13815921917557716, 0.0...",0
1,0.362618,7.877863,4.316017,-0.292096,0.666667,0,1,"[0.14064250700175762, 0.0028862979263067244, -...",0
2,0.232104,0.366412,6.705628,-0.426117,1.000000,0,1,"[0.07760866896973716, 0.05833733040425512, 0.2...",1
3,-0.076887,-0.045802,0.043290,0.487973,0.333333,1,0,"[0.020305360401315347, 0.037071573947157176, 0...",0
4,23.473225,0.717557,0.714286,-0.371134,0.666667,0,1,"[-0.1835014725724856, -0.2997356637613848, 0.0...",1
...,...,...,...,...,...,...,...,...,...
1396,-0.084104,-0.488550,-0.350649,2.443299,-0.777778,1,0,"[-0.31757273025472055, -0.17581527815623718, 0...",1
1397,0.722883,9.198473,10.406926,-0.371134,0.333333,0,1,"[-0.031781600415706636, 0.0024340026080608367,...",0
1398,0.010306,1.511450,0.831169,-0.024055,1.111111,1,0,"[0.11681999691895076, 0.2433300060885293, 0.37...",0
1399,0.177065,0.198473,-0.233766,-0.271478,0.333333,0,1,"[0.18432844802737236, 0.23420833320253426, 0.3...",0


In [11]:
# Held-out (validation + test) side: start again from the raw pickles and drop every
# post / reply whose post id was assigned to the training split.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
test_val_df_replies, test_val_df_posts = (
    pd.read_pickle(pickle_path)
    for pickle_path in (file_path_replies, file_path_posts)
)
test_val_df_posts = test_val_df_posts.loc[~test_val_df_posts['id'].isin(train['id'])]
test_val_df_replies = test_val_df_replies.loc[~test_val_df_replies['id'].isin(train['id'])]

In [14]:
# Column subsets for the held-out side; `post_features` is reset to a list of column names here.
post_features = [
    'followers',
    'favorite_count',
    'retweet_count',
    'verified',
    'rumour',
    'id',
    'embeddings_avg',
]
test_val_reply_features = [
    'reply_followers',
    'reply_user_id',
    'reply_verified',
    'time_diff',
    'reply_embeddings_avg',
    'id',
]

In [15]:
# Minutes since the earliest `time` among the held-out rows (train ids were already removed,
# so the reference point differs from the one used on the training side).
test_val_df_replies['min_since_fst_post'] = (
    (test_val_df_replies['time'] - test_val_df_replies['time'].min())
    .dt.total_seconds()
    .div(60)
    .round(2)
)

# Keep only replies inside both time windows, restricted to the reply feature columns.
test_val_df_replies = test_val_df_replies.loc[
    (test_val_df_replies['time_diff'] <= time_cut_replies)
    & (test_val_df_replies['min_since_fst_post'] <= time_cut_posts),
    test_val_reply_features,
]

# One row per post id: number of surviving reply rows and the first `time_diff` for that id.
grouped_replies = (
    test_val_df_replies
    .groupby(['id'])
    .agg(replies=('time_diff', 'count'), first_time_diff=('time_diff', 'first'))
    .reset_index()
)

In [16]:
# Attach the filtered reply statistics; the inner join drops held-out posts with no surviving replies.
test_val_df_posts = (
    test_val_df_posts[post_features]
    .merge(grouped_replies, how="inner", on="id")
    .assign(
        replies=lambda merged: merged['replies'].fillna(0),
        first_time_diff=lambda merged: merged['first_time_diff'].fillna(0),
    )
)

In [17]:
# One-hot encode `verified` into `no_verified` / `verified` (appended as the last two columns).
# Both indicator columns are built explicitly: pd.get_dummies only emits a column for values
# that actually occur, and the time-filtered held-out subset can easily contain only verified
# or only unverified posts, which would leave a column missing and raise a KeyError below.
_verified_flag = test_val_df_posts['verified'].astype(str).replace({'True': '1', 'False': '0'}).astype(int)
test_val_df_posts = test_val_df_posts.drop(columns=['verified']).assign(
    no_verified=(_verified_flag == 0).astype(int),
    verified=(_verified_flag == 1).astype(int),
)

In [18]:
# Recover, for every held-out post, the row label it had in the val / test splits
# (reset_index exposes that label as an `index` column) and use it as the frame index.
test_val_df_posts = (
    test_val_df_posts
    .merge(pd.concat([val, test])[['id']].reset_index(), how='left', on='id')
    .set_index('index', drop=True)
)

In [19]:
# Held-out feature frame (validation and test posts together).
# The continuous columns go through the scaler already fitted on the training rows
# (transform only, no refit); the remaining columns are copied over unscaled.
post_features = test_val_df_posts[[
    "followers", "favorite_count", "retweet_count", "no_verified", "verified",
    'replies', "first_time_diff", "embeddings_avg", "rumour",
]]

scaled_features = scaler_posts.transform(
    post_features[['followers', 'favorite_count', 'retweet_count', 'first_time_diff', 'replies']]
)

# Convert the scaled array back to a DataFrame; values are attached positionally,
# so the result carries a fresh 0..n-1 index.
scaled_data = pd.DataFrame(
    scaled_features,
    columns=['followers', 'favorite_count', 'retweet_count', 'first_time_diff', 'replies'],
).assign(
    no_verified=post_features['no_verified'].to_numpy(),
    verified=post_features['verified'].to_numpy(),
    embeddings_avg=post_features['embeddings_avg'].to_numpy(),
    rumour=post_features['rumour'].to_numpy(),
)
test_dataset = scaled_data

In [20]:
# Display the scaled held-out frame built in the previous cell.
test_dataset

Unnamed: 0,followers,favorite_count,retweet_count,first_time_diff,replies,no_verified,verified,embeddings_avg,rumour
0,0.431871,-0.51145,-0.402597,-0.164948,-0.666667,0,1,"[-0.045377860377941816, -0.20127306692302227, ...",1
1,-0.083555,-0.465649,-0.238095,3.123711,-0.333333,1,0,"[0.29239066131412983, -0.10304541109750669, -0...",1
2,-0.065156,-0.259542,0.242424,2.920962,-0.444444,1,0,"[-0.1634751863100312, -0.04001500118862499, -0...",1
3,4.820053,-0.29771,-0.363636,-0.391753,0.777778,0,1,"[-0.01919064891214172, -0.09012852932016055, -...",1
4,0.001917,-0.465649,-0.077922,-0.364261,0.222222,0,1,"[-0.20219282586784923, 0.06077094069298576, -0...",1
5,0.07907,-0.580153,-0.281385,2.388316,-0.888889,0,1,"[-0.09368875250220299, -0.08087433222681284, -...",1
6,0.058563,-0.473282,-0.233766,0.027491,-0.333333,0,1,"[-0.055836799740791324, -0.018984799832105638,...",0
7,-0.084082,-0.206107,-0.402597,0.587629,-0.333333,1,0,"[-0.18590750013078963, 0.18958520836063794, -0...",0
8,-0.080128,5.167939,6.112554,0.498282,0.888889,1,0,"[-0.24018099895593795, 0.3274898173456842, 0.0...",0
9,-0.066474,-0.160305,-0.17316,-0.233677,-0.444444,1,0,"[-0.06890936195850372, 0.1694636426188729, 0.3...",0


In [1]:
class LoadRumoursDatasetFilterNodeonTestV2:
    """Build a scaled training frame and a time-filtered held-out frame of posts.

    Posts and replies are read from two pickle files. Replies are aggregated per post
    id (reply count and first ``time_diff``) and joined onto the posts. The posts are
    split 70 / 15 / 15 (stratified on ``rumour``, seed 42); continuous columns are
    scaled with a ``RobustScaler`` fitted on the training rows only.

    The time cut-offs are applied on the held-out side only: the training frame is
    built from all replies, whereas the held-out frame (validation and test posts
    together) is rebuilt from replies that satisfy both cut-offs.

    Typical use: ``load_data()`` -> ``process_data()`` -> ``get_final_dataframes()``.
    """

    # Columns read from the posts pickle.
    _POST_COLUMNS = ['followers', 'favorite_count', 'retweet_count', 'verified', 'rumour', 'id', 'embeddings_avg']
    # Columns kept from the replies pickle once the held-out time filters are applied.
    _REPLY_COLUMNS = ['reply_followers', 'reply_user_id', 'reply_verified', 'time_diff',
                      'reply_embeddings_avg', 'id']
    # Columns passed through the scaler, in the order of the returned frames.
    _SCALED_COLUMNS = ['followers', 'favorite_count', 'retweet_count', 'first_time_diff', 'replies']
    # Columns copied into the returned frames without scaling.
    _UNSCALED_COLUMNS = ['no_verified', 'verified', 'embeddings_avg', 'rumour']

    def __init__(self, file_path_replies, file_path_posts, time_cut_replies=80, time_cut_posts=15):
        """Store the pickle locations and the held-out cut-offs.

        Args:
            file_path_replies: Path of the pickled replies frame.
            file_path_posts: Path of the pickled posts frame.
            time_cut_replies: Upper bound (inclusive) on a held-out reply's ``time_diff``.
                Units are those of the ``time_diff`` column (presumably minutes; confirm).
            time_cut_posts: Upper bound (inclusive) on ``min_since_fst_post``, i.e. minutes
                between a row's ``time`` and the earliest ``time`` among held-out replies.
        """
        self.file_path_replies = file_path_replies
        self.file_path_posts = file_path_posts
        self.time_cut_replies = time_cut_replies
        self.time_cut_posts = time_cut_posts
        self.scaler_posts = RobustScaler()

    def load_data(self):
        """Read both pickles into ``self.df_replies`` and ``self.df_posts``.

        NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
        """
        self.df_replies = pd.read_pickle(self.file_path_replies)
        self.df_posts = pd.read_pickle(self.file_path_posts)

    @staticmethod
    def _minutes_since_first_post(replies):
        """Return minutes (2 decimals) between each row's ``time`` and the frame's earliest ``time``."""
        elapsed = replies['time'] - replies['time'].min()
        return (elapsed.dt.total_seconds() / 60).round(2)

    @staticmethod
    def _encode_verified(posts):
        """Replace ``verified`` by the 0/1 indicator columns ``no_verified`` and ``verified``.

        Both columns are always created. ``pd.get_dummies`` would omit the column of a
        value that does not occur (e.g. a held-out subset with no verified post), which
        made the later column selection fail with a KeyError.
        """
        flag = posts['verified'].astype(str).replace({'True': '1', 'False': '0'}).astype(int)
        return posts.drop(columns=['verified']).assign(
            no_verified=(flag == 0).astype(int),
            verified=(flag == 1).astype(int),
        )

    def _merge_reply_stats(self, posts, replies):
        """Join per-post reply count / first ``time_diff`` onto ``posts`` and encode ``verified``.

        The join is an inner join, so posts without any reply row in ``replies`` are dropped.
        """
        reply_stats = replies.groupby(['id']).agg(
            replies=('time_diff', 'count'),
            first_time_diff=('time_diff', 'first')
        ).reset_index()
        merged = posts[self._POST_COLUMNS].merge(reply_stats, on="id", how="inner")
        merged['replies'] = merged['replies'].fillna(0)
        merged['first_time_diff'] = merged['first_time_diff'].fillna(0)
        return self._encode_verified(merged)

    def _scaled_frame(self, posts, fit):
        """Return the model-ready frame for ``posts``; fit the scaler first when ``fit`` is true.

        Values are attached positionally, so the result has a fresh 0..n-1 index.
        """
        numeric = posts[self._SCALED_COLUMNS]
        if fit:
            scaled_values = self.scaler_posts.fit_transform(numeric)
        else:
            scaled_values = self.scaler_posts.transform(numeric)
        frame = pd.DataFrame(scaled_values, columns=self._SCALED_COLUMNS)
        for column in self._UNSCALED_COLUMNS:
            frame[column] = posts[column].to_numpy()
        return frame

    def process_data(self):
        """Build ``self.train_dataset`` and ``self.test_dataset``.

        Requires ``load_data()`` to have been called. Side effects: adds
        ``min_since_fst_post`` to ``self.df_replies``, replaces ``self.df_posts`` by the
        merged / encoded posts frame, fits ``self.scaler_posts`` on the training rows,
        and re-reads both pickles to rebuild the held-out side from raw data.
        """
        # --- training side: all replies, no time filtering ---
        self.df_replies['min_since_fst_post'] = self._minutes_since_first_post(self.df_replies)
        self.df_posts = self._merge_reply_stats(self.df_posts, self.df_replies)

        train, not_train = train_test_split(self.df_posts, test_size=0.3, random_state=42,
                                            stratify=self.df_posts['rumour'])
        val, test = train_test_split(not_train, test_size=0.5, random_state=42,
                                     stratify=not_train['rumour'])

        self.train_dataset = self._scaled_frame(train, fit=True)

        # --- held-out side: raw data minus everything belonging to a training post ---
        held_out_replies = pd.read_pickle(self.file_path_replies)
        held_out_posts = pd.read_pickle(self.file_path_posts)
        held_out_posts = held_out_posts[~held_out_posts.id.isin(train.id)]
        # .copy() so the column assignment below does not write into a slice of the raw frame.
        held_out_replies = held_out_replies[~held_out_replies.id.isin(train.id)].copy()

        # The reference time is the earliest `time` among held-out rows only.
        held_out_replies['min_since_fst_post'] = self._minutes_since_first_post(held_out_replies)
        inside_window = (
            (held_out_replies.time_diff <= self.time_cut_replies)
            & (held_out_replies.min_since_fst_post <= self.time_cut_posts)
        )
        held_out_replies = held_out_replies.loc[inside_window, self._REPLY_COLUMNS]

        held_out_posts = self._merge_reply_stats(held_out_posts, held_out_replies)

        # Re-attach the row labels the posts had in the val / test splits. Training ids were
        # removed above, so the lookup must be built from val + test (it was val + train).
        # The labels are not carried into the returned frame, which is rebuilt positionally.
        split_labels = pd.concat([val, test])[['id']].reset_index()
        held_out_posts = held_out_posts.merge(split_labels, on='id', how='left')
        held_out_posts = held_out_posts.set_index('index', drop=True)

        self.test_dataset = self._scaled_frame(held_out_posts, fit=False)

    def get_final_dataframes(self):
        """Return ``(train_dataset, test_dataset)`` as built by ``process_data()``."""
        return self.train_dataset, self.test_dataset

In [6]:
# Same pipeline as the cells above, run through the class wrapper.
file_path_posts = "/home/azureuser/rumour-detection-pheme/posts_charlie_hebdo.pkl"
file_path_replies = "/home/azureuser/rumour-detection-pheme/replies_charlie_hebdo.pkl"

dataset_generator = LoadRumoursDatasetFilterNodeonTestV2(
    file_path_replies,
    file_path_posts,
    time_cut_replies=80,
    time_cut_posts=15,
)
dataset_generator.load_data()
dataset_generator.process_data()
train_dataset, test_dataset = dataset_generator.get_final_dataframes()

In [7]:
# Display the training frame returned by the dataset generator.
train_dataset

Unnamed: 0,followers,favorite_count,retweet_count,first_time_diff,replies,no_verified,verified,embeddings_avg,rumour
0,-0.085680,1.458015,0.528139,0.240550,0.333333,1,0,"[0.20999414314116752, 0.13815921917557716, 0.0...",0
1,0.362618,7.877863,4.316017,-0.292096,0.666667,0,1,"[0.14064250700175762, 0.0028862979263067244, -...",0
2,0.232104,0.366412,6.705628,-0.426117,1.000000,0,1,"[0.07760866896973716, 0.05833733040425512, 0.2...",1
3,-0.076887,-0.045802,0.043290,0.487973,0.333333,1,0,"[0.020305360401315347, 0.037071573947157176, 0...",0
4,23.473225,0.717557,0.714286,-0.371134,0.666667,0,1,"[-0.1835014725724856, -0.2997356637613848, 0.0...",1
...,...,...,...,...,...,...,...,...,...
1396,-0.084104,-0.488550,-0.350649,2.443299,-0.777778,1,0,"[-0.31757273025472055, -0.17581527815623718, 0...",1
1397,0.722883,9.198473,10.406926,-0.371134,0.333333,0,1,"[-0.031781600415706636, 0.0024340026080608367,...",0
1398,0.010306,1.511450,0.831169,-0.024055,1.111111,1,0,"[0.11681999691895076, 0.2433300060885293, 0.37...",0
1399,0.177065,0.198473,-0.233766,-0.271478,0.333333,0,1,"[0.18432844802737236, 0.23420833320253426, 0.3...",0


In [44]:
# Display the held-out frame returned by the dataset generator.
test_dataset

Unnamed: 0,followers,favorite_count,retweet_count,first_time_diff,replies,no_verified,verified,embeddings_avg,rumour
0,0.431871,-0.51145,-0.402597,-0.164948,-0.666667,0,1,"[-0.045377860377941816, -0.20127306692302227, ...",1
1,-0.083555,-0.465649,-0.238095,3.123711,-0.333333,1,0,"[0.29239066131412983, -0.10304541109750669, -0...",1
2,-0.065156,-0.259542,0.242424,2.920962,-0.444444,1,0,"[-0.1634751863100312, -0.04001500118862499, -0...",1
3,4.820053,-0.29771,-0.363636,-0.391753,0.777778,0,1,"[-0.01919064891214172, -0.09012852932016055, -...",1
4,0.001917,-0.465649,-0.077922,-0.364261,0.222222,0,1,"[-0.20219282586784923, 0.06077094069298576, -0...",1
5,0.07907,-0.580153,-0.281385,2.388316,-0.888889,0,1,"[-0.09368875250220299, -0.08087433222681284, -...",1
6,0.058563,-0.473282,-0.233766,0.027491,-0.333333,0,1,"[-0.055836799740791324, -0.018984799832105638,...",0
7,-0.084082,-0.206107,-0.402597,0.587629,-0.333333,1,0,"[-0.18590750013078963, 0.18958520836063794, -0...",0
8,-0.080128,5.167939,6.112554,0.498282,0.888889,1,0,"[-0.24018099895593795, 0.3274898173456842, 0.0...",0
9,-0.066474,-0.160305,-0.17316,-0.233677,-0.444444,1,0,"[-0.06890936195850372, 0.1694636426188729, 0.3...",0


In [2]:
class Load_Rumours_Dataset_filtering_since_first_post:
    """Build scaled train / held-out post frames, cutting the held-out side by elapsed time.

    Posts and replies are read from two pickle files. Replies are aggregated per
    (post id, ``min_since_fst_post``) into a reply count and the first ``time_diff``,
    and joined onto the posts. The posts are split 70 / 15 / 15 (stratified on
    ``rumour``, seed 42); continuous columns, including ``min_since_fst_post``, are
    scaled with a ``RobustScaler`` fitted on the training rows only.

    ``time_cut`` is applied on the held-out side only: a held-out reply is kept when
    both its post time and its own reply time lie within ``time_cut`` minutes of the
    earliest ``time`` among held-out rows. The training frame uses all replies.

    Typical use: ``load_data()`` -> ``process_data()`` -> ``get_final_dataframes()``.
    """

    # Columns read from the posts pickle.
    _POST_COLUMNS = ['followers', 'favorite_count', 'retweet_count', 'verified', 'rumour', 'id', 'embeddings_avg']
    # Columns passed through the scaler, in the order of the returned frames.
    _SCALED_COLUMNS = ['followers', 'favorite_count', 'retweet_count', 'first_time_diff', 'replies',
                       'min_since_fst_post']
    # Columns copied into the returned frames without scaling.
    _UNSCALED_COLUMNS = ['no_verified', 'verified', 'embeddings_avg', 'rumour']

    def __init__(self, file_path_replies, file_path_posts, time_cut):
        """Store the pickle locations and the held-out cut-off.

        Args:
            file_path_replies: Path of the pickled replies frame.
            file_path_posts: Path of the pickled posts frame.
            time_cut: Upper bound (inclusive), in minutes, applied to both
                ``min_since_fst_post`` and ``reply_min_since_fst_post`` of held-out replies.
        """
        self.file_path_replies = file_path_replies
        self.file_path_posts = file_path_posts
        self.time_cut = time_cut
        self.scaler_posts = RobustScaler()

    def load_data(self):
        """Read both pickles into ``self.df_replies`` and ``self.df_posts``.

        NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
        """
        self.df_replies = pd.read_pickle(self.file_path_replies)
        self.df_posts = pd.read_pickle(self.file_path_posts)

    @staticmethod
    def _add_elapsed_minutes(replies):
        """Add ``min_since_fst_post`` and ``reply_min_since_fst_post`` to ``replies`` in place.

        Both are minutes (2 decimals) measured from the earliest ``time`` in the frame:
        the first from each row's ``time``, the second from each row's ``reply_time``.
        """
        first_time = replies['time'].min()
        replies['min_since_fst_post'] = (
            (replies['time'] - first_time).dt.total_seconds() / 60).round(2)
        replies['reply_min_since_fst_post'] = (
            (replies['reply_time'] - first_time).dt.total_seconds() / 60).round(2)
        return replies

    @staticmethod
    def _encode_verified(posts):
        """Replace ``verified`` by the 0/1 indicator columns ``no_verified`` and ``verified``.

        Both columns are always created. ``pd.get_dummies`` would omit the column of a
        value that does not occur (e.g. a held-out subset with no verified post), which
        made the later column selection fail with a KeyError.
        """
        flag = posts['verified'].astype(str).replace({'True': '1', 'False': '0'}).astype(int)
        return posts.drop(columns=['verified']).assign(
            no_verified=(flag == 0).astype(int),
            verified=(flag == 1).astype(int),
        )

    def _merge_reply_stats(self, posts, replies):
        """Join per-post reply statistics onto ``posts`` and encode ``verified``.

        Replies are grouped by (``id``, ``min_since_fst_post``) so the elapsed-minutes
        value travels with the post. The join is an inner join, so posts without any
        reply row in ``replies`` are dropped.
        """
        reply_stats = replies.groupby(['id', 'min_since_fst_post']).agg(
            replies=('time_diff', 'count'),
            first_time_diff=('time_diff', 'first')
        ).reset_index()
        merged = posts[self._POST_COLUMNS].merge(reply_stats, on="id", how="inner")
        for column in ('replies', 'first_time_diff', 'min_since_fst_post'):
            merged[column] = merged[column].fillna(0)
        return self._encode_verified(merged)

    def _scaled_frame(self, posts, fit):
        """Return the model-ready frame for ``posts``; fit the scaler first when ``fit`` is true.

        Values are attached positionally, so the result has a fresh 0..n-1 index.
        """
        numeric = posts[self._SCALED_COLUMNS]
        if fit:
            scaled_values = self.scaler_posts.fit_transform(numeric)
        else:
            scaled_values = self.scaler_posts.transform(numeric)
        frame = pd.DataFrame(scaled_values, columns=self._SCALED_COLUMNS)
        for column in self._UNSCALED_COLUMNS:
            frame[column] = posts[column].to_numpy()
        return frame

    def process_data(self):
        """Build ``self.train_dataset`` and ``self.test_dataset``.

        Requires ``load_data()`` to have been called. Side effects: adds the two
        elapsed-minutes columns to ``self.df_replies``, replaces ``self.df_posts`` by the
        merged / encoded posts frame, fits ``self.scaler_posts`` on the training rows,
        and re-reads both pickles to rebuild the held-out side from raw data.
        """
        # --- training side: all replies, no time filtering ---
        self._add_elapsed_minutes(self.df_replies)
        self.df_posts = self._merge_reply_stats(self.df_posts, self.df_replies)

        train, not_train = train_test_split(self.df_posts, test_size=0.3, random_state=42,
                                            stratify=self.df_posts['rumour'])
        val, test = train_test_split(not_train, test_size=0.5, random_state=42,
                                     stratify=not_train['rumour'])

        self.train_dataset = self._scaled_frame(train, fit=True)

        # --- held-out side: raw data minus everything belonging to a training post ---
        held_out_replies = pd.read_pickle(self.file_path_replies)
        held_out_posts = pd.read_pickle(self.file_path_posts)
        held_out_posts = held_out_posts[~held_out_posts.id.isin(train.id)]
        # .copy() so the column assignments below do not write into a slice of the raw frame.
        held_out_replies = held_out_replies[~held_out_replies.id.isin(train.id)].copy()

        # The reference time is the earliest `time` among held-out rows only.
        self._add_elapsed_minutes(held_out_replies)
        inside_window = (
            (held_out_replies.reply_min_since_fst_post <= self.time_cut)
            & (held_out_replies.min_since_fst_post <= self.time_cut)
        )
        held_out_replies = held_out_replies[inside_window]

        held_out_posts = self._merge_reply_stats(held_out_posts, held_out_replies)

        # Re-attach the row labels the posts had in the val / test splits. Training ids were
        # removed above, so the lookup must be built from val + test (it was val + train).
        # The labels are not carried into the returned frame, which is rebuilt positionally.
        split_labels = pd.concat([val, test])[['id']].reset_index()
        held_out_posts = held_out_posts.merge(split_labels, on='id', how='left')
        held_out_posts = held_out_posts.set_index('index', drop=True)

        self.test_dataset = self._scaled_frame(held_out_posts, fit=False)

    def get_final_dataframes(self):
        """Return ``(train_dataset, test_dataset)`` as built by ``process_data()``."""
        return self.train_dataset, self.test_dataset

In [9]:
# Run the "filter since first post" pipeline with a 14-minute cut-off on the held-out side.
file_path_replies = r"/home/azureuser/rumour-detection-pheme/replies_charlie_hebdo.pkl"
file_path_posts = r"/home/azureuser/rumour-detection-pheme/posts_charlie_hebdo.pkl"


# The class defined in the cell above is Load_Rumours_Dataset_filtering_since_first_post;
# the previously referenced name LoadRumoursDatasetFilterNodeonTestV3 is not defined in this
# notebook and raises NameError on a fresh kernel.
dataset_generator = Load_Rumours_Dataset_filtering_since_first_post(
    file_path_replies, file_path_posts, time_cut=14)
dataset_generator.load_data()
dataset_generator.process_data()
train_dataset, test_dataset = dataset_generator.get_final_dataframes()

In [44]:
# Display the training frame returned by the dataset generator (includes min_since_fst_post).
train_dataset

Unnamed: 0,followers,favorite_count,retweet_count,first_time_diff,replies,min_since_fst_post,no_verified,verified,embeddings_avg,rumour
0,-0.085680,1.458015,0.528139,0.240550,0.333333,-0.470152,1,0,"[0.20999414314116752, 0.13815921917557716, 0.0...",0
1,0.362618,7.877863,4.316017,-0.292096,0.666667,0.674339,0,1,"[0.14064250700175762, 0.0028862979263067244, -...",0
2,0.232104,0.366412,6.705628,-0.426117,1.000000,0.628095,0,1,"[0.07760866896973716, 0.05833733040425512, 0.2...",1
3,-0.076887,-0.045802,0.043290,0.487973,0.333333,-0.478916,1,0,"[0.020305360401315347, 0.037071573947157176, 0...",0
4,23.473225,0.717557,0.714286,-0.371134,0.666667,0.541007,0,1,"[-0.1835014725724856, -0.2997356637613848, 0.0...",1
...,...,...,...,...,...,...,...,...,...,...
1396,-0.084104,-0.488550,-0.350649,2.443299,-0.777778,-0.464416,1,0,"[-0.31757273025472055, -0.17581527815623718, 0...",1
1397,0.722883,9.198473,10.406926,-0.371134,0.333333,0.678289,0,1,"[-0.031781600415706636, 0.0024340026080608367,...",0
1398,0.010306,1.511450,0.831169,-0.024055,1.111111,-0.226038,1,0,"[0.11681999691895076, 0.2433300060885293, 0.37...",0
1399,0.177065,0.198473,-0.233766,-0.271478,0.333333,-0.458515,0,1,"[0.18432844802737236, 0.23420833320253426, 0.3...",0


In [45]:
# Display the held-out frame returned by the dataset generator (includes min_since_fst_post).
test_dataset

Unnamed: 0,followers,favorite_count,retweet_count,first_time_diff,replies,min_since_fst_post,no_verified,verified,embeddings_avg,rumour
0,0.431871,-0.51145,-0.402597,-0.164948,-0.666667,-0.524376,0,1,"[-0.045377860377941816, -0.20127306692302227, ...",1
1,-0.083555,-0.465649,-0.238095,3.123711,-0.333333,-0.522552,1,0,"[0.29239066131412983, -0.10304541109750669, -0...",1
2,-0.065156,-0.259542,0.242424,2.920962,-0.444444,-0.521566,1,0,"[-0.1634751863100312, -0.04001500118862499, -0...",1
3,4.820053,-0.29771,-0.363636,-0.391753,0.777778,-0.521439,0,1,"[-0.01919064891214172, -0.09012852932016055, -...",1
4,0.001917,-0.465649,-0.077922,-0.364261,0.222222,-0.52035,0,1,"[-0.20219282586784923, 0.06077094069298576, -0...",1
5,0.07907,-0.580153,-0.281385,2.388316,-0.888889,-0.519244,0,1,"[-0.09368875250220299, -0.08087433222681284, -...",1
6,-0.066443,-0.137405,-0.138528,-0.367698,0.0,-0.517118,1,0,"[-0.12754214421979018, 0.14119799940713815, 0....",1
7,1.342326,-0.541985,-0.402597,-0.099656,-0.666667,-0.517118,0,1,"[-0.0888463114388287, -0.08276199921965599, 0....",1
8,-0.083523,-0.541985,-0.333333,-0.065292,-0.333333,-0.516835,0,1,"[0.13196508524318537, 0.17555500070254007, -0....",1
9,-0.049821,-0.19084,0.121212,-0.343643,0.111111,-0.51495,1,0,"[-0.11424199802180131, 0.028796917758882046, 0...",1
