In [1]:
from decouple import config
from pymongo import MongoClient
import pandas as pd

In [2]:
# Look up the MongoDB connection string through decouple's `config` so the
# value itself never appears in the notebook.
connection_string = config("MONGO_CONNECTION_STRING")

In [3]:
# Connect, then pick the database named "db" and its "twitter" collection.
client = MongoClient(connection_string)
db = client["db"]
coll = db.twitter

In [4]:
def _has_any(array_field: str) -> dict:
    """Build an aggregation expression that is true when `array_field` is non-empty.

    `$size` raises if its argument is missing or null, which would abort the
    whole aggregation on a single tweet lacking that entity list, so the field
    is defaulted to an empty array first. `$gte` already yields a boolean, so
    no `$cond` wrapper is needed.

    Args:
        array_field: Field path (with leading `$`) of the array to test.

    Returns:
        A MongoDB aggregation expression evaluating to true/false.
    """
    return {"$gte": [{"$size": {"$ifNull": [array_field, []]}}, 1]}


# Flatten each tweet document into one row per tweet and order the rows by
# favorite_count, most-favorited first.
pipeline = [
    {
        "$project": {
            "_id": 0,
            # Lift the nested user fields to top-level columns.
            "username": "$user.screen_name",
            "userdesc": "$user.description",
            "verified": "$user.verified",
            "followers": "$user.followers_count",
            # created_at is stored as a string; convert it to a date.
            "created_at": {
                "$dateFromString": {
                    "dateString": "$created_at"
                }
            },
            "text": 1,
            "favorite_count": 1,
            "retweet_count": 1,
            "has_urls": _has_any("$entities.urls"),
            "has_mentions": _has_any("$entities.user_mentions"),
        }
    },
    {
        "$sort": {
            "favorite_count": -1
        }
    }
]

In [5]:
# Run the projection pipeline and load the resulting documents (one dict per
# tweet) into a DataFrame.
tweet_records = list(coll.aggregate(pipeline))
df = pd.DataFrame(tweet_records)

In [6]:
# Preview the first five rows (already sorted by favorite_count, descending).
df.head(n=5)

Unnamed: 0,text,retweet_count,favorite_count,username,userdesc,verified,followers,created_at,has_urls,has_mentions
0,They are starting to get more and more despera...,24298,168648,GretaThunberg,17 year old climate and environmental activist...,True,4079169,2020-02-29 15:26:10,True,False
1,I do not believe we will defeat Donald Trump w...,18592,77895,BernieSanders,U.S. Senator from Vermont and candidate for Pr...,True,10951634,2020-03-02 20:30:56,False,False
2,Indigenous rights = Climate justice\n#Wetsuwet...,4609,21488,GretaThunberg,17 year old climate and environmental activist...,True,4086646,2020-02-08 13:36:48,True,False
3,Stop running away from your problem. Run into ...,2739,16317,pulte,The Philanthropist. Inventor of Twitter Philan...,True,2059165,2020-02-29 21:19:22,True,False
4,Support the Wet’suwet’en Nation and the pipeli...,2972,10035,GretaThunberg,17 year old climate and environmental activist...,True,4091979,2020-02-18 10:13:02,True,False


In [7]:
# Count tweets per favorite_count range. Boundaries are lower-inclusive and
# upper-exclusive; anything outside [0, 1000000) lands in the -1 bucket.
_favorite_level_stage = {
    "$bucket": {
        "groupBy": "$favorite_count",
        "boundaries": [0, 4, 40, 400, 1200, 1000000],
        "default": -1,
        "output": {"count": {"$sum": 1}},
    }
}

favorite_buckets = [
    {"$facet": {"favorite_level": [_favorite_level_stage]}}
]

In [8]:
# Tweet counts per favorite_count bucket, all tweets:
#   0-3 favs:  2813 (of which 1490 have 0)
#   4-39:       984
#   40-399:     334
#   400-1199:    58
#   1200+:       30
favorite_level_counts = list(coll.aggregate(favorite_buckets))
favorite_level_counts

[{'favorite_level': [{'_id': 0, 'count': 2813},
   {'_id': 4, 'count': 984},
   {'_id': 40, 'count': 334},
   {'_id': 400, 'count': 58},
   {'_id': 1200, 'count': 30}]}]

In [9]:
# Favorite-count distribution for tweets whose author has at least 5000
# followers; MongoDB picks the bucket edges itself (up to 8 buckets).
_many_followers_match = {
    "$match": {"user.followers_count": {"$gte": 5000}}
}
_many_followers_buckets = {
    "$bucketAuto": {"groupBy": "$favorite_count", "buckets": 8}
}

favorite_buckets_many_followers = [
    {
        "$facet": {
            "favorite_level": [_many_followers_match, _many_followers_buckets]
        }
    }
]

In [10]:
# Bucketed favorite counts for authors with >= 5000 followers.
many_followers_levels = list(coll.aggregate(favorite_buckets_many_followers))
many_followers_levels

[{'favorite_level': [{'_id': {'min': 0, 'max': 2}, 'count': 162},
   {'_id': {'min': 2, 'max': 5}, 'count': 125},
   {'_id': {'min': 5, 'max': 10}, 'count': 130},
   {'_id': {'min': 10, 'max': 21}, 'count': 119},
   {'_id': {'min': 21, 'max': 46}, 'count': 118},
   {'_id': {'min': 46, 'max': 124}, 'count': 119},
   {'_id': {'min': 124, 'max': 596}, 'count': 118},
   {'_id': {'min': 596, 'max': 168648}, 'count': 55}]}]

In [11]:
# Same analysis for the remaining authors (fewer than 5000 followers), asking
# for up to 10 automatically chosen favorite_count buckets.
_few_followers_stages = [
    {"$match": {"user.followers_count": {"$lt": 5000}}},
    {"$bucketAuto": {"groupBy": "$favorite_count", "buckets": 10}},
]

favorite_buckets_few_followers = [
    {"$facet": {"favorite_level": _few_followers_stages}}
]

In [12]:
# Bucketed favorite counts for authors with < 5000 followers.
few_followers_levels = list(coll.aggregate(favorite_buckets_few_followers))
few_followers_levels

[{'favorite_level': [{'_id': {'min': 0, 'max': 1}, 'count': 1406},
   {'_id': {'min': 1, 'max': 2}, 'count': 655},
   {'_id': {'min': 2, 'max': 4}, 'count': 499},
   {'_id': {'min': 4, 'max': 10}, 'count': 359},
   {'_id': {'min': 10, 'max': 255}, 'count': 327},
   {'_id': {'min': 255, 'max': 1910}, 'count': 27}]}]

In [13]:
# Accounts with at least this many followers are held to the higher threshold.
HIGH_FOLLOWERS = 5000
# Minimum favorite_count for a "high response" tweet, per follower tier.
HIGH_FOLLOWERS_RESPONSE_THRESHOLD = 400
LOW_FOLLOWERS_RESPONSE_THRESHOLD = 100


def is_high_response(row: pd.Series) -> bool:
    """Tell whether a tweet drew a high response relative to its author's reach.

    Args:
        row: One tweet, indexable by column name; must provide 'followers'
            and 'favorite_count' (e.g. a row passed by `df.apply(..., axis=1)`).

    Returns:
        True when 'favorite_count' reaches the threshold of the author's
        follower tier, otherwise False. A row whose 'followers' value compares
        as neither >= nor < HIGH_FOLLOWERS (e.g. NaN) is never high-response.
    """
    followers = row['followers']
    if followers >= HIGH_FOLLOWERS:
        threshold = HIGH_FOLLOWERS_RESPONSE_THRESHOLD
    elif followers < HIGH_FOLLOWERS:
        threshold = LOW_FOLLOWERS_RESPONSE_THRESHOLD
    else:
        # Only reachable for unordered values such as NaN.
        return False
    # bool() so numpy comparison results come back as a plain Python bool.
    return bool(row['favorite_count'] >= threshold)

In [15]:
# Flag every tweet as high/low response (row-wise) and store the flags in a
# new column, then preview the result.
high_response_flags = df.apply(is_high_response, axis="columns")
df["high_response"] = high_response_flags
df.head()

Unnamed: 0,text,retweet_count,favorite_count,username,userdesc,verified,followers,created_at,has_urls,has_mentions,high_response
0,They are starting to get more and more despera...,24298,168648,GretaThunberg,17 year old climate and environmental activist...,True,4079169,2020-02-29 15:26:10,True,False,True
1,I do not believe we will defeat Donald Trump w...,18592,77895,BernieSanders,U.S. Senator from Vermont and candidate for Pr...,True,10951634,2020-03-02 20:30:56,False,False,True
2,Indigenous rights = Climate justice\n#Wetsuwet...,4609,21488,GretaThunberg,17 year old climate and environmental activist...,True,4086646,2020-02-08 13:36:48,True,False,True
3,Stop running away from your problem. Run into ...,2739,16317,pulte,The Philanthropist. Inventor of Twitter Philan...,True,2059165,2020-02-29 21:19:22,True,False,True
4,Support the Wet’suwet’en Nation and the pipeli...,2972,10035,GretaThunberg,17 year old climate and environmental activist...,True,4091979,2020-02-18 10:13:02,True,False,True


In [16]:
# Persist the enriched frame as CSV, leaving out the positional index.
df.to_csv(path_or_buf="aggregation.csv", index=False)