<a href="https://colab.research.google.com/github/erdemkarakoylu/fake_news_retweet/blob/main/01_extract_user_and_tweet_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preamble

In this notebook I extract users, tweets, and retweets, and load each of them into its own dataframe. I then store each of these dataframes as a table in a SQLite database.

In [1]:
from datetime import datetime
import json
from pathlib import Path
from typing import Generator, List

import numpy as np
import pandas as pd
import sqlite3

from google.colab import drive

In [2]:
# Mount Google Drive so the shared tweet folder and the project folder are reachable.
drive.mount('/content/drive')
# Shared-drive folder holding the tweet JSON files that are globbed further down.
files_location = "/content/drive/Shareddrives/IAC2 Project Execution/INCAS/INCAS Data/Phase 2  Data/Erdem Files/tweets"
# Project folder; the sample retweet used by the tests below is read from here.
project_location = Path("/content/drive/MyDrive/FakeNewsProject/GNNs")
tweet_folder = Path(files_location)
# Fail fast if the tweet folder is not visible after mounting.
assert tweet_folder.exists()

Mounted at /content/drive


In [3]:
# Fields of the Twitter user object that the hand-crafted features rely on.
USER_FEATURES_TO_EXTRACT = [
    'id',
    'name',
    'description',
    'verified',
    'geo_enabled',
    'followers_count',
    'friends_count',
    'statuses_count',
    'favourites_count',
    'listed_count',
    'created_at',
]
# This is here as a reference for what to extract
def hand_feature(user_dict):
    """Build the hand-crafted profile feature matrix for a collection of users.

    Args:
        user_dict: Mapping whose values are user profile dicts. Each profile
            must provide 'verified', 'geo_enabled', 'followers_count',
            'friends_count', 'statuses_count', 'favourites_count',
            'listed_count', 'created_at', 'name' and 'description'.

    Returns:
        A float32 array of shape (len(user_dict), 10), one row per profile in
        iteration order. Columns: verified, geo_enabled, followers, friends,
        statuses, favourites, lists, months between March 2006 and account
        creation, word count of the name, word count of the description.

    Raises:
        KeyError: if a profile lacks one of the required keys.
        ValueError: if 'created_at' does not match the expected timestamp format.
    """
    feature = np.zeros([len(user_dict), 10], dtype=np.float32)
    est_date = datetime.fromisoformat('2006-03-21')
    for row, profile in enumerate(user_dict.values()):
        # 1) Verified?, 2) Enable geo-spatial positioning, 3) Followers count, 4) Friends count
        vector = [int(profile['verified']), int(profile['geo_enabled']), profile['followers_count'], profile['friends_count']]
        # 5) Status count, 6) Favorite count, 7) Number of lists
        vector += [profile['statuses_count'], profile['favourites_count'], profile['listed_count']]

        # 8) Created time (No. of months since Twitter established)
        user_date = datetime.strptime(profile['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
        month_diff = (user_date.year - est_date.year) * 12 + user_date.month - est_date.month
        vector += [month_diff]

        # 9) Number of words in the name, 10) Number of words in the description.
        # A missing (None) text field counts as zero words instead of crashing.
        vector += [len((profile['name'] or '').split()), len((profile['description'] or '').split())]

        feature[row, :] = vector

    return feature

In [4]:
def determine_tweet_type(tweet):
    """Return tweet type for further decision making.

    Args:
        tweet: Tweet payload (dict). Must contain "in_reply_to_status_id",
            "is_quote_status" and "text"; "retweeted_status" is optional.

    Returns:
        One of "Reply Tweet", "Quote Tweet", "Retweet" or "Source Tweet".

    Raises:
        KeyError: if one of the required keys is missing.
    """
    # Check for reply indicator first
    if tweet["in_reply_to_status_id"] is not None:
        return "Reply Tweet"
    starts_with_rt = tweet["text"].startswith("RT")
    # Check boolean quote status field and make sure it's not a RT of a Quote Tweet:
    # a retweet of a quote tweet also carries is_quote_status=True, and it must
    # fall through to the retweet check below.
    if tweet["is_quote_status"] is True and not starts_with_rt:
        return "Quote Tweet"
    # Check both indicators of a Retweet
    if starts_with_rt and tweet.get("retweeted_status") is not None:
        return "Retweet"
    return "Source Tweet"

In [6]:
np.zeros(10, dtype=np.float32)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [13]:
def make_feature_from_user_profile(profile:dict)->list:
    """Compute the 10 hand-crafted features of a single user profile.

    Args:
        profile: Twitter user object (dict) providing 'verified',
            'geo_enabled', 'followers_count', 'friends_count',
            'statuses_count', 'favourites_count', 'listed_count',
            'created_at', 'name' and 'description'.

    Returns:
        A plain list of 10 values, in this order: verified, geo_enabled,
        followers, friends, statuses, favourites, lists, months between
        March 2006 and account creation, word count of the name, word count
        of the description.

    Raises:
        KeyError: if the profile lacks one of the required keys.
        ValueError: if 'created_at' does not match the expected timestamp format.
    """
    est_date = datetime.fromisoformat('2006-03-21')
    # 1) Verified?, 2) Enable geo-spatial positioning, 3) Followers count, 4) Friends count
    vector = [int(profile['verified']), int(profile['geo_enabled']), profile['followers_count'], profile['friends_count']]
    # 5) Status count, 6) Favorite count, 7) Number of lists
    vector += [profile['statuses_count'], profile['favourites_count'], profile['listed_count']]
    # 8) Created time (No. of months since Twitter established)
    user_date = datetime.strptime(profile['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    month_diff = (user_date.year - est_date.year) * 12 + user_date.month - est_date.month
    vector += [month_diff]
    # 9) Number of words in the name, 10) Number of words in the description.
    # A missing (None) text field counts as zero words instead of crashing.
    vector += [len((profile['name'] or '').split()), len((profile['description'] or '').split())]
    return vector

In [8]:
# Output key -> key in the Twitter user object. Only the id is renamed;
# every other field keeps its Twitter name.
USER_KEY_MAP = {
    'user_id': 'id',
    **{
        field: field
        for field in (
            'name', 'description', 'created_at', 'verified', 'geo_enabled',
            'followers_count', 'friends_count', 'statuses_count', 'listed_count',
        )
    },
}
# Keys of a retweet payload.
RETWEET_KEYS = [
    'user_id',
    'retweet_id',
    'created_at',
    'tweet_id',
    'source_user_id',
]
# Keys of a tweet payload.
TWEET_KEYS = [
    'tweet_id',
    'created_at',
    'user_id',
    'text',
    'retweet_count',
    'favorite_count',
]

def extract_tweet_data(tweet:dict)->dict:
    """
    Extracts the tweet-level fields of a tweet.
    Input: tweet payload -> Dict
    Output: tweet payload keyed by TWEET_KEYS (tweet id, timestamp,
    author id, text, retweet count, favorite count) -> Dict
    """
    # Start from the full key set so every expected key is present.
    payload = dict.fromkeys(TWEET_KEYS)
    payload.update(
        tweet_id=tweet['id'],
        created_at=tweet['created_at'],
        user_id=tweet['user']['id'],
        text=tweet['text'],
        retweet_count=tweet['retweet_count'],
        favorite_count=tweet['favorite_count'],
    )
    return payload

def extract_user_data(tweet:dict)->dict:
    """
    Builds the user payload for the author of a tweet.
    Input: tweet payload -> Dict
    Output: user payload -> Dict, holding the fields listed in USER_KEY_MAP
    plus 'recorded_at' and 'engineered_features'.
    """
    user_payload = {}
    for payload_key, profile_key in USER_KEY_MAP.items():
        user_payload[payload_key] = tweet['user'][profile_key]
    # Timestamp of the tweet this profile snapshot came from; used to decide
    # whether to update an already existing user entry.
    user_payload['recorded_at'] = tweet['created_at']
    engineered = make_feature_from_user_profile(tweet['user'])
    user_payload['engineered_features'] = engineered
    return user_payload

def extract_retweet_data(tweet:dict):
    """
    Extracts the fields describing a retweet.
    Input: retweet payload (must carry 'retweeted_status') -> Dict
    Output: retweet payload keyed by RETWEET_KEYS -> Dict
    """
    payload = dict.fromkeys(RETWEET_KEYS)
    # The retweet itself: who retweeted, the retweet's own id and timestamp.
    payload.update(
        user_id=tweet['user']['id'],
        retweet_id=tweet['id'],
        created_at=tweet['created_at'],
    )
    # The tweet that was retweeted and its author.
    source_status = tweet['retweeted_status']
    payload.update(
        tweet_id=source_status['id'],
        source_user_id=source_status['user']['id'],
    )
    return payload

def manage_retweet_extraction(tweet:dict)->dict:
    """
    Collects everything needed to rebuild the retweet graph from one retweet.
    Returns a dict with four payloads: the retweet, the retweeting user,
    the source tweet and the source tweet's author.
    """
    return {
        'retweet_payload': extract_retweet_data(tweet),
        'retweet_user_payload': extract_user_data(tweet),
        # The embedded original tweet is handled like any other tweet.
        'source_tweet_payload': extract_tweet_data(tweet['retweeted_status']),
        'source_user_payload': extract_user_data(tweet['retweeted_status']),
    }

def manage_tweet_extraction(tweet:dict)->dict:
    """
    Collects the data documenting a post.
    Returns a dict with two payloads: the tweet and its author.
    """
    return {
        'tweet_payload': extract_tweet_data(tweet),
        'user_payload': extract_user_data(tweet),
    }

def manage_extraction(tweet_collection:List|Generator)->tuple:
    """
    Creates and populates the tweets, users and retweets dataframes.

    Args:
        tweet_collection: Iterable of tweet payloads (dicts).

    Returns:
        Tuple (d_tweets, d_users, d_retweets) of dataframes.
        - A retweet contributes one retweet row, one tweet row (the source
          tweet) and two user rows (retweeter and source author).
        - A source tweet contributes one tweet row and one user row.
        - Every other tweet type is skipped.
        User rows are not de-duplicated here.
    """
    retweet_rows, user_rows, tweet_rows = [], [], []
    for tweet in tweet_collection:
        tweet_type = determine_tweet_type(tweet)
        if tweet_type == 'Retweet':
            payloads = manage_retweet_extraction(tweet)
            retweet_rows.append(payloads['retweet_payload'])
            tweet_rows.append(payloads['source_tweet_payload'])
            user_rows.append(payloads['retweet_user_payload'])
            user_rows.append(payloads['source_user_payload'])
        elif tweet_type == "Source Tweet":
            payloads = manage_tweet_extraction(tweet)
            tweet_rows.append(payloads['tweet_payload'])
            user_rows.append(payloads['user_payload'])
    # Passing the key lists as columns keeps the schema stable even when no
    # row was collected.
    d_tweets = pd.DataFrame(tweet_rows, columns=TWEET_KEYS)
    d_users = pd.DataFrame(user_rows)
    # Always build the retweets frame: previously it was left undefined when
    # the collection held no retweet, so the return raised UnboundLocalError.
    d_retweets = pd.DataFrame(retweet_rows, columns=RETWEET_KEYS)
    return d_tweets, d_users, d_retweets

In [23]:
def test_extract_tweet_data(sample_tweet:dict):
    """Check extract_tweet_data on the source tweet embedded in the sample retweet."""
    source_status = sample_tweet['retweeted_status']
    expected_payload = dict(
        tweet_id=1656373983795728407,
        created_at="Wed May 10 19:01:46 +0000 2023",
        user_id=1170759331719127041,
        text="Levallois : Bruno le Maire obligé de rentrer par les sous-sols pour assister à une réunion avec le patron de l’Oréa… https://t.co/nFSJRwfW65",
        retweet_count=1507,
        favorite_count=2574,
    )
    assert extract_tweet_data(source_status) == expected_payload

def test_extract_user_data(sample_tweet:dict):
    """Check extract_user_data on the retweeting user of the sample retweet."""
    expected_payload = dict(
        user_id=335979100,
        name="olanno",
        description="",
        created_at="Fri Jul 15 15:13:58 +0000 2011",
        verified=False,
        geo_enabled=False,
        followers_count=939,
        friends_count=1712,
        statuses_count=67078,
        listed_count=3,
        # Timestamp of the retweet the profile was read from.
        recorded_at="Fri May 12 09:50:42 +0000 2023",
        engineered_features=[0, 0, 939, 1712, 67078, 36727, 3, 64, 1, 0],
    )
    assert expected_payload == extract_user_data(sample_tweet)

def test_extract_retweet_data(sample_tweet:dict):
    """Check extract_retweet_data on the sample retweet."""
    expected_payload = dict(
        user_id=335979100,  # the retweeter (author of the retweet)
        retweet_id=1656960078145101824,
        created_at="Fri May 12 09:50:42 +0000 2023",
        tweet_id=1656373983795728407,  # id of the original tweet
        source_user_id=1170759331719127041,  # author of the original tweet
    )
    assert extract_retweet_data(sample_tweet) == expected_payload

In [24]:
# Load the sample retweet and display the user payload extracted from it.
# `tw` is reused by the exploratory cells further down.
with open(project_location / 'sample_retweet.json') as f:
    tw = json.load(f)
curr_payl = extract_user_data(tw)
curr_payl

{'user_id': 335979100,
 'name': 'olanno',
 'description': '',
 'created_at': 'Fri Jul 15 15:13:58 +0000 2011',
 'verified': False,
 'geo_enabled': False,
 'followers_count': 939,
 'friends_count': 1712,
 'statuses_count': 67078,
 'listed_count': 3,
 'recorded_at': 'Fri May 12 09:50:42 +0000 2023',
 'engineered_features': [0, 0, 939, 1712, 67078, 36727, 3, 64, 1, 0]}

In [30]:
def run_tests():
    """Load the sample retweet and run every extraction test against it."""
    sample_path = project_location / 'sample_retweet.json'
    with open(sample_path) as sample_file:
        sample = json.load(sample_file)
    # Same order as the payloads are introduced: tweet, user, retweet.
    for check in (test_extract_tweet_data, test_extract_user_data, test_extract_retweet_data):
        check(sample)
run_tests()

In [None]:
# Load the tweets of a single file before processing the whole folder.
# NOTE(review): glob order is presumably not sorted, so which file ends up as
# `file0` may differ between environments — confirm whether that matters.
files = tweet_folder.glob('*.json')
file0 = next(files)  # raises StopIteration when the folder holds no .json file
with open(file0) as f0:
    tweets = json.load(f0)

In [None]:
# Build the tweets (dtw), users (du) and retweets (drt) dataframes from the loaded file.
dtw, du, drt = manage_extraction(tweets)

In [None]:
dtw.head()

Unnamed: 0,tweet_id,created_at,user_id,text,retweet_count,favorite_count
0,1656373983795728407,Wed May 10 19:01:46 +0000 2023,1170759331719127041,Levallois : Bruno le Maire obligé de rentrer p...,1507,2574
1,1656253139169705986,Wed May 10 11:01:34 +0000 2023,845399351929294854,#casserolades en vue :\nELIZABETH BORNE EST AT...,474,617
2,1656211220611883008,Wed May 10 08:15:00 +0000 2023,902982970558083076,Il n’y a AUCUNE équivalence possible entre ext...,1024,2391
3,1656322227183255553,Wed May 10 15:36:06 +0000 2023,2383844606,"Attaques en véhicule, à l’arme blanche, fusill...",505,553
4,1655991651394027521,Tue May 09 17:42:31 +0000 2023,80820758,Au #Sénégal le chef de l'opposition @SonkoOffi...,1627,4502


In [None]:
dtw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2622 entries, 0 to 2621
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        2622 non-null   int64 
 1   created_at      2622 non-null   object
 2   user_id         2622 non-null   int64 
 3   text            2622 non-null   object
 4   retweet_count   2622 non-null   int64 
 5   favorite_count  2622 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 123.0+ KB


In [None]:
dtw.duplicated().sum()

0

In [None]:
du.head()

Unnamed: 0,user_id,name,description,created_at,verified,geo_enabled,followers_count,friends_count,statuses_count,recorded_at
0,335979100,olanno,,Fri Jul 15 15:13:58 +0000 2011,False,False,939,1712,67078,Fri May 12 09:50:42 +0000 2023
1,1170759331719127041,Réalité Actuelle,,Sun Sep 08 18:02:38 +0000 2019,False,False,15717,9345,6821,Wed May 10 19:01:46 +0000 2023
2,335979100,olanno,,Fri Jul 15 15:13:58 +0000 2011,False,False,939,1712,67078,Wed May 10 17:40:03 +0000 2023
3,845399351929294854,Vilain Cégétiste,"Bordélisateur CGT chez Tuifrance, militant syn...",Fri Mar 24 22:18:07 +0000 2017,False,False,4513,2035,14276,Wed May 10 11:01:34 +0000 2023
4,335979100,olanno,,Fri Jul 15 15:13:58 +0000 2011,False,False,939,1712,67078,Wed May 10 17:39:45 +0000 2023


In [None]:
du.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5225 entries, 0 to 5224
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          5225 non-null   int64 
 1   name             5225 non-null   object
 2   description      5225 non-null   object
 3   created_at       5225 non-null   object
 4   verified         5225 non-null   bool  
 5   geo_enabled      5225 non-null   bool  
 6   followers_count  5225 non-null   int64 
 7   friends_count    5225 non-null   int64 
 8   statuses_count   5225 non-null   int64 
 9   recorded_at      5225 non-null   object
dtypes: bool(2), int64(4), object(4)
memory usage: 336.9+ KB


In [None]:
# De-duplicate users on the profile columns only (`recorded_at` is ignored),
# keeping the first record seen. `subset` expects a sequence of column labels,
# so the mapping's keys are passed explicitly as a list.
du.drop_duplicates(subset=list(USER_KEY_MAP), keep='first').head()

Unnamed: 0,user_id,name,description,created_at,verified,geo_enabled,followers_count,friends_count,statuses_count,recorded_at
0,335979100,olanno,,Fri Jul 15 15:13:58 +0000 2011,False,False,939,1712,67078,Fri May 12 09:50:42 +0000 2023
1,1170759331719127041,Réalité Actuelle,,Sun Sep 08 18:02:38 +0000 2019,False,False,15717,9345,6821,Wed May 10 19:01:46 +0000 2023
3,845399351929294854,Vilain Cégétiste,"Bordélisateur CGT chez Tuifrance, militant syn...",Fri Mar 24 22:18:07 +0000 2017,False,False,4513,2035,14276,Wed May 10 11:01:34 +0000 2023
5,902982970558083076,Mathieu Slama,"Essayiste, auteur « Adieu la liberté » aux @Pr...",Wed Aug 30 19:54:51 +0000 2017,False,False,40325,1409,7621,Wed May 10 08:15:00 +0000 2023
7,2383844606,Thomas Portes,Cheminot - Député LFI-NUPES - CGT 3ème circo d...,Tue Mar 11 14:58:13 +0000 2014,False,True,88767,7343,41836,Wed May 10 15:36:06 +0000 2023


In [None]:
drt.head()


Unnamed: 0,user_id,retweet_id,created_at,tweet_id,source_user_id
0,335979100,1656960078145101824,Fri May 12 09:50:42 +0000 2023,1656373983795728407,1170759331719127041
1,335979100,1656353419328647169,Wed May 10 17:40:03 +0000 2023,1656253139169705986,845399351929294854
2,335979100,1656353342686130181,Wed May 10 17:39:45 +0000 2023,1656211220611883008,902982970558083076
3,335979100,1656353289364021249,Wed May 10 17:39:32 +0000 2023,1656322227183255553,2383844606
4,335979100,1656353137706369026,Wed May 10 17:38:56 +0000 2023,1655991651394027521,80820758


In [None]:
drt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         2603 non-null   int64 
 1   retweet_id      2603 non-null   int64 
 2   created_at      2603 non-null   object
 3   tweet_id        2603 non-null   int64 
 4   source_user_id  2603 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 101.8+ KB


In [None]:
drt.duplicated().sum()

0

- TODO#1: Remove duplicates from users (subsetted to USER_KEY_MAP).
- TODO#2: Inject the dataframes into SQLite tables.
- TODO#3: Redo for all files; check that SQLite doesn't blow up.
- TODO#4: Download the saved database and evaluate its size and manipulability (do I need a different RDBMS?).
- TODO#5: Transfer the database to Mindflayer.

### Run tests

#### Tweet (post) data

In [None]:
tw.keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])

In [None]:
tw['id']

1656960078145101824

In [None]:
tw['created_at']

'Fri May 12 09:50:42 +0000 2023'

In [None]:
tw['text']

'RT @ReaActuelle: Levallois : Bruno le Maire obligé de rentrer par les sous-sols pour assister à une réunion avec le patron de l’Oréal pour…'

In [None]:
tw['retweet_count']

1507

In [None]:
tw['favorite_count']

0

#### User data

In [None]:
tw['user'].keys()

dict_keys(['id', 'id_str', 'name', 'screen_name', 'location', 'description', 'url', 'entities', 'protected', 'followers_count', 'friends_count', 'listed_count', 'created_at', 'favourites_count', 'utc_offset', 'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang', 'contributors_enabled', 'is_translator', 'is_translation_enabled', 'profile_background_color', 'profile_background_image_url', 'profile_background_image_url_https', 'profile_background_tile', 'profile_image_url', 'profile_image_url_https', 'profile_link_color', 'profile_sidebar_border_color', 'profile_sidebar_fill_color', 'profile_text_color', 'profile_use_background_image', 'has_extended_profile', 'default_profile', 'default_profile_image', 'following', 'follow_request_sent', 'notifications', 'translator_type', 'withheld_in_countries'])

In [None]:
# Print a selection of profile fields of the sample tweet's author (the retweeter).
user_keys_ = ['id', 'name', 'description', 'created_at', 'verified', 'geo_enabled', 'followers_count', 'friends_count', 'statuses_count']
for key in user_keys_:
    print(f"{key}: {tw['user'][key]}")

id: 335979100
name: olanno
description: 
created_at: Fri Jul 15 15:13:58 +0000 2011
verified: False
geo_enabled: False
followers_count: 939
friends_count: 1712
statuses_count: 67078


#### Source tweet from retweet

In [None]:
tw['retweeted_status'].keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'lang'])

In [None]:
tw['retweeted_status']['id']

1656373983795728407

In [None]:
tw['retweeted_status']['created_at']

'Wed May 10 19:01:46 +0000 2023'

In [None]:
tw['retweeted_status']['user']['id']

1170759331719127041

In [None]:
print(tw['retweeted_status']['text'])

Levallois : Bruno le Maire obligé de rentrer par les sous-sols pour assister à une réunion avec le patron de l’Oréa… https://t.co/nFSJRwfW65


In [None]:
# Keep both profiles at hand: the retweeter's and the source tweet author's.
user1 = tw['user']
user2 = tw['retweeted_status']['user']