# Library Imports

In [None]:
import numpy as np
import json

# Load Data File

In [None]:
# Read the raw profile dump. The encoding is given explicitly because JSON
# text is UTF-8, while open() otherwise falls back to a platform-dependent
# default that can fail on non-ASCII characters.
with open('twitter.json', 'r', encoding='utf-8') as f:
    twitter = json.load(f)

# List the fields available on the first profile.
for key in twitter[0]:
    print(key)


_id
Rest ID
Is Blue Verified
Created At
Description
Favourites Count
Followers Count
Friends Count
Listed Count
Location
Name
Normal Followers Count
Pinned Tweet IDs Str
Screen Name
Statuses Count
Verified
Want Retweets
Is Identity Verified
Following List


# Select Features / Data Processing

In [None]:
# 'Rest ID' is the key the later cells use to look profiles up
# (get_elements_by_id, the DataFrame 'Rest ID' column), so it must be kept.
# NOTE(review): this list previously contained '_id' instead of 'Rest ID',
# which makes those later lookups raise KeyError - confirm 'Rest ID' is the
# identifier that 'Following List' entries refer to.
# The remaining features can be edited.

selected_cols = [
    'Rest ID',
    'Is Blue Verified',
    'Followers Count',
    'Friends Count',
    'Listed Count',
    'Statuses Count',
    'Following List',
]

Based on the above keys, filter down our dataset for later processing.

In [None]:
# Keep only the selected columns of every profile.
twitter_filtered = [
    {column: record[column] for column in selected_cols}
    for record in twitter
]

# Show which columns survived the filtering.
for column_name in twitter_filtered[0]:
    print(column_name)

Rest ID
Is Blue Verified
Followers Count
Friends Count
Listed Count
Statuses Count
Following List


Helper function for getting a row for a particular ID.

In [None]:
def get_elements_by_id(dict_list, target_id):
    """Return every dict in ``dict_list`` whose "Rest ID" equals ``target_id``.

    The result is a new list, in input order, holding the matching dicts
    themselves (not copies). It is empty when nothing matches.
    """
    return [entry for entry in dict_list if entry["Rest ID"] == target_id]

In [None]:
# Report how many profiles (nodes) survived the filtering step.
node_total = len(twitter_filtered)
print(f'There are {node_total} total nodes.')

There are 15416 total nodes.


For each node, do the following:


1.   Look up every account listed in the node's `Following List`
2.   Add up the `Followers Count` of each of those accounts that is present in our dataset
3.   Store the total on the node as `Nested Followers`

Eventually, we will replace this with a richer data source, i.e. data extracted from a fully constructed social graph.

Note that we also skip if we encounter an ID that isn't in our dataset. Thus, our current iteration is potentially biased. This will be mitigated once we get more data, and can extract richer messages from it.

Also note that the `count` variable is printed out and incremented purely to show that the program is still running. This cell can take upwards of 30 minutes to run for our current 15k nodes. This efficiency can be increased once we have a static dataset and establish a nice database, as it's a one-time computation.



In [None]:
# Index the profiles by Rest ID once, so each lookup in the loop below is a
# dict access instead of a scan over the whole dataset.
profiles_by_id = {}
for profile in twitter_filtered:
    profiles_by_id.setdefault(profile['Rest ID'], []).append(profile)

count = 0
for profile in twitter_filtered:
    # Each 'Following List' entry is split on ':' and the second piece is
    # used as the Rest ID to look up.
    followers = [follower.split(':')[1] for follower in profile['Following List']]
    nested_followers = 0

    for follower in followers:
        follower_dicts = profiles_by_id.get(follower, [])

        # Only count IDs that resolve to exactly one profile in the dataset.
        # NOTE(review): an ID that appears more than once in the dataset is
        # skipped by this check, same as an ID that is missing - confirm that
        # this is intended.
        if len(follower_dicts) == 1:
            nested_followers += follower_dicts[0]['Followers Count']

    profile['Nested Followers'] = nested_followers
    count += 1

    # Progress indicator only.
    if count % 100 == 0:
        print(f'{count=}')

count=100
count=200
count=300
count=400
count=500
count=600
count=700
count=800
count=900
count=1000
count=1100
count=1200
count=1300
count=1400
count=1500
count=1600
count=1700
count=1800
count=1900
count=2000
count=2100
count=2200
count=2300
count=2400
count=2500
count=2600
count=2700
count=2800
count=2900
count=3000
count=3100
count=3200
count=3300
count=3400
count=3500
count=3600
count=3700
count=3800
count=3900
count=4000
count=4100
count=4200
count=4300
count=4400
count=4500
count=4600
count=4700
count=4800
count=4900
count=5000
count=5100
count=5200
count=5300
count=5400
count=5500
count=5600
count=5700
count=5800
count=5900
count=6000
count=6100
count=6200
count=6300
count=6400
count=6500
count=6600
count=6700
count=6800
count=6900
count=7000
count=7100
count=7200
count=7300
count=7400
count=7500
count=7600
count=7700
count=7800
count=7900
count=8000
count=8100
count=8200
count=8300
count=8400
count=8500
count=8600
count=8700
count=8800
count=8900
count=9000
count=9100
count=92

In [None]:
# Spot-check the first profile: its own follower count next to the
# aggregated 'Nested Followers' value.
first_profile = twitter_filtered[0]
print(first_profile['Followers Count'])
print(first_profile['Nested Followers'])

5239949
7255464


Since the "Following List" key is not one-dimensional data, SVD will not work on it. Thus, in our modified dataset, we can just delete it since we are done performing calculations on it.

In [None]:
# The SVD step needs one scalar per cell, so remove the list-valued column
# from every profile now that it is no longer needed.
for record in twitter_filtered:
    record.pop('Following List')

In [None]:
# Confirm which columns remain on the first profile.
for column_name in twitter_filtered[0]:
    print(column_name)

Rest ID
Is Blue Verified
Followers Count
Friends Count
Listed Count
Statuses Count
Nested Followers


Convert to a pandas dataframe.

In [None]:
import pandas as pd

# Toggle: True reads the previously saved 'twitter_df.csv';
# False builds the frame from the in-memory twitter_filtered list.
load_csv = True

twitter_df = pd.read_csv('twitter_df.csv') if load_csv else pd.DataFrame(twitter_filtered)

To give the SVD some form of scale, we give accounts with more than 1,000,000 followers a score of 1 and accounts with fewer than 100 followers a score of 0; every other account is left unscored. Of course, this is inaccurate, but we will improve it in the future once we identify "top voices" to give a score of 1 and can hand-mark accounts that should be given a score of 0.

In [None]:
# Seed labels: 0 for fewer than 100 followers, 1 for more than 1,000,000,
# and NaN (unlabelled) for everything in between.
# np.nan is used instead of None so the column stays float64 rather than
# becoming an object column of mixed ints and None.
twitter_df['Score'] = np.where(
    twitter_df['Followers Count'] < 100,
    0.0,
    np.where(twitter_df['Followers Count'] > 1000000, 1.0, np.nan),
)

Perform a truncated SVD with one component. Replace the "score" column with the calculated component.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD

# Keep the ID column aside; it is an identifier, not a feature.
ids = twitter_df['Rest ID']

# Everything except the ID goes into the decomposition. This includes the
# seeded 'Score' column, since only 'Rest ID' is dropped here.
twitter_df_no_ids = twitter_df.drop(columns=['Rest ID'])

# Fill missing values (e.g. the unlabelled scores) with the column mean.
imputer = SimpleImputer(strategy='mean')
imputer.fit(twitter_df_no_ids)
twitter_df_imputed = imputer.transform(twitter_df_no_ids)

# One-component truncated SVD. random_state is fixed because the default
# solver is randomized; without it, repeated runs can give different scores.
# NOTE(review): the features are not rescaled before the SVD, so columns with
# large magnitudes presumably dominate the component - confirm this is wanted.
svd = TruncatedSVD(n_components=1, random_state=0)
svd.fit(twitter_df_imputed)

# Project every row onto the single component.
twitter_df_transformed = svd.transform(twitter_df_imputed)

# Wrap the projection in a dataframe and attach the IDs for reference.
twitter_df_transformed_df = pd.DataFrame(twitter_df_transformed, columns=['Transformed_Score'])
twitter_df_transformed_df['Rest ID'] = ids

# Overwrite the whole 'Score' column (seeded values included, not only the
# missing ones) with the SVD component.
twitter_df['Score'] = twitter_df_transformed_df['Transformed_Score']


In [None]:
# Show the dataframe, now including the recomputed 'Score' column.
print(twitter_df)

                   Rest ID  Is Blue Verified  Followers Count  Friends Count  \
0                 95092020              True          5239949           1616   
1                 44196397              True        183260444            596   
2      1641378826537295874              True            40831            173   
3       713551239645007873              True            43685            221   
4                 23022687              True          6326943           5428   
...                    ...               ...              ...            ...   
15411  1442893938508005378             False            16430              9   
15412  1511793131884318720              True             2376           1295   
15413            232132831              True             4127            322   
15414            212962007              True            10745            718   
15415            834607710              True            63337           2178   

       Listed Count  Statuses Count  Ne

Store the data in CSV form if desired.

In [None]:
# Writes the frame, without the index column, to the same 'twitter_df.csv'
# path that the load_csv branch reads from, so an existing file is overwritten.
twitter_df.to_csv('twitter_df.csv', index=False)

Show the IDs with the highest and lowest scores.

In [None]:
# Take the 100 highest- and lowest-scoring rows rather than 10: the printing
# code further down skips repeated and unresolvable IDs and stops after 10,
# so it needs spare candidates. The names are kept because that code uses them.
top_10_ids = twitter_df.sort_values(by=['Score'], ascending=False)['Rest ID'].head(100).tolist()
bottom_10_ids = twitter_df.sort_values(by=['Score'], ascending=True)['Rest ID'].head(100).tolist()

# Report the real number of IDs shown instead of a hard-coded "10".
print(f'{len(top_10_ids)} Rest IDs with highest scores: {top_10_ids}')
print(f'{len(bottom_10_ids)} Rest IDs with lowest scores: {bottom_10_ids}')


10 Rest IDs with highest scores: [956333277744218113, 17240448, 2190386409, 954510728181903361, 818876014390603776, 3221715698, 3180033032, 1280996358724489216, 15520427, 44196397, 44196397, 44196397, 40661455, 1109029162248036353, 959264982839058433, 829551745, 1749585078471086083, 35383754, 2346396080, 38187809, 166009659, 29547260, 14260608, 12901712, 12901712, 12901712, 61301738, 977687191202693125, 977687191202693125, 3349531, 2314443930, 18687249, 1427467304745410560, 304715166, 1062926939151888385, 939091, 108471631, 1359665613082402819, 990278348914950144, 1138120410233360384, 610183411, 77594892, 1739832116446420992, 38170974, 205622130, 104489372, 2859546873, 21668101, 22110773, 961436288095281152, 1343370552, 2864217936, 31672644, 3256996608, 1212608257983799296, 1382209054999646212, 1574863808098598912, 1510036486426660883, 1140059638575947776, 16809973, 1389913567671975937, 1577293480308654082, 1742406387815337984, 3881408656, 3024509200, 1339603952808067073, 1442995056755

Helper functions to find the names and screen names. This can definitely be made more efficient, as can a lot of the data transformations.

In [None]:
def find_screen_name_by_id(rest_id):
    """Return the "Screen Name" of the first profile in ``twitter`` whose
    "Rest ID", converted to int, equals ``rest_id``; None when none matches.
    """
    return next(
        (entry["Screen Name"] for entry in twitter if int(entry["Rest ID"]) == rest_id),
        None,
    )

def find_name_by_id(rest_id):
    """Return the "Name" of the first profile in ``twitter`` whose "Rest ID",
    converted to int, equals ``rest_id``; None when none matches.
    """
    return next(
        (entry["Name"] for entry in twitter if int(entry["Rest ID"]) == rest_id),
        None,
    )

In [None]:
def find_id_by_screen_name(screen_name):
    """Return, as an int, the "Rest ID" of the first profile in ``twitter``
    whose "Screen Name" equals ``screen_name``; None when none matches.
    """
    return next(
        (int(entry["Rest ID"]) for entry in twitter if entry["Screen Name"] == screen_name),
        None,
    )

Print the top scorers and bottom scorers.

In [None]:
def _print_first_unique_profiles(rest_ids, limit=10):
    """Print name and screen name for the first ``limit`` distinct IDs that resolve.

    IDs are processed in order. A repeated ID is skipped, and so is an ID for
    which ``find_name_by_id`` returns None; neither counts towards ``limit``.
    """
    seen = set()  # set membership is O(1); the ID list contains repeats
    printed = 0
    for rest_id in rest_ids:  # named rest_id so the builtin id() is not shadowed
        if rest_id not in seen:
            seen.add(rest_id)
            name = find_name_by_id(rest_id)
            if name is not None:
                screen_name = find_screen_name_by_id(rest_id)
                print(f"The name of the user with Rest ID {rest_id} is {name} AKA {screen_name}")
                printed += 1

        if printed >= limit:
            break


print('TOP SCORES')
_print_first_unique_profiles(top_10_ids)

print('BOTTOM SCORES')
_print_first_unique_profiles(bottom_10_ids)

TOP SCORES
The name of the user with Rest ID 956333277744218113 is Chia Network AKA chia_project
The name of the user with Rest ID 17240448 is Lauren Reidy AKA l_reids
The name of the user with Rest ID 2190386409 is Friendship Circle AKA FriendshipCir
The name of the user with Rest ID 954510728181903361 is Simons Institute for the Theory of Computing AKA SimonsInstitute
The name of the user with Rest ID 818876014390603776 is Melania Trump 45 Archived AKA FLOTUS45
The name of the user with Rest ID 3221715698 is Madison365 AKA madison_365
The name of the user with Rest ID 3180033032 is Feng Zhang AKA zhangf
The name of the user with Rest ID 1280996358724489216 is macrofying AKA macrofying
The name of the user with Rest ID 15520427 is Sajid Sadi AKA sajid_sadi
The name of the user with Rest ID 44196397 is Elon Musk AKA elonmusk
BOTTOM SCORES
The name of the user with Rest ID 1469132474475167754 is National Coalition for Food & Ag Research AKA NationalCFAR
The name of the user with Rest ID

In [None]:
def find_score_by_rest_id(rest_id, twitter_df):
    """Return the 'Score' of the first row whose 'Rest ID' equals ``rest_id``.

    Args:
        rest_id: ID to look for in the 'Rest ID' column.
        twitter_df: DataFrame with 'Rest ID' and 'Score' columns.

    Returns:
        The matching row's 'Score', or None when no row matches.
    """
    # Compare on the column itself rather than via iterrows(): iterrows()
    # upcasts each row to one dtype, which can turn large integer IDs into
    # floats and make neighbouring IDs compare equal.
    matches = twitter_df.loc[twitter_df['Rest ID'] == rest_id, 'Score']
    if matches.empty:
        return None
    return matches.iloc[0]

# Example: resolve a screen name to its Rest ID, then report that ID's score.
rest_id = find_id_by_screen_name('elonmusk')
score = find_score_by_rest_id(rest_id, twitter_df)

if score is not None:
  print(f"Score for Rest ID {rest_id}: {score}")
else:
  print(f"No row found with Rest ID {rest_id}")


Score for Rest ID 44196397: 172524672.01968035
