## Preliminaries

In [1]:
# Imports

from dotenv import load_dotenv
import os
import praw
from praw.models import MoreComments
import matplotlib as plt
import re, pandas as pd
import numpy as np
import time
from datetime import datetime, timezone, timedelta

In [6]:
# Set user's data path
# The location can be overridden without editing the notebook by setting the
# TWEETS_PATH environment variable; otherwise the original local default is used.

PATH = os.environ.get(
    "TWEETS_PATH",
    "C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA/TWEETS/archive/nikelululemonadidas_tweets.jsonl",
)

## Helper functions 

In [3]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [4]:
# Function to get datetime from UTC timestamp

def dt_from_utc(ts: float) -> pd.Timestamp:

    '''
    Convert a Unix timestamp expressed in seconds into a timezone-aware
    pandas Timestamp. The result stays in UTC (no local-time conversion).
    '''

    utc_moment = pd.to_datetime(ts, unit="s", utc=True)
    return utc_moment

In [5]:
# Function to examine dataframes

def examine_df(name,df,
               include_stats = True,
               include_sample = True):
    
    """
    Print and display a quick overview of a dataframe.

    Parameters
    ----------
    name : str
        Label for the dataframe, used in the printed headings.
    df : pd.DataFrame
        Dataframe to summarise.
    include_stats : bool, default True
        When truthy, also display ``df.describe()``.
    include_sample : bool, default True
        When truthy, also display the first five rows.

    Returns
    -------
    None
        Everything is written to the cell output.
    """
    
    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    # df.info() prints its own report and returns None, so it is called
    # directly; wrapping it in display() rendered a stray "None" afterwards.
    df.info()
    if include_stats:
        print(f'\n Basic statistical info about {name}:\n')
        # display is the rich-output function provided by the notebook session
        display(df.describe())
    if include_sample:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

## Load and Inspect Data

In [8]:
# Read the tweet archive into a dataframe; lines=True parses the file as
# line-delimited JSON (one record per line).
# NOTE(review): the describe() output in this notebook lists id_str and the
# other *_str columns as numeric, so their values were inferred as numbers --
# confirm whether the IDs need to be kept as strings.

tweets_df = pd.read_json(PATH, lines=True)

In [9]:
# Overview of the tweets dataframe: size, columns, info, summary stats and sample rows

examine_df(name='Twitter dataframe', df=tweets_df)



Number of records in the Twitter dataframe is: 175078


Number of features in the Twitter dataframe is: 32

The columns in the Twitter dataframe are: Index(['created_at', 'id', 'id_str', 'full_text', 'truncated',
       'display_text_range', 'entities', 'extended_entities', 'source',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'is_quote_status', 'retweet_count', 'favorite_count',
       'favorited', 'retweeted', 'possibly_sensitive', 'lang',
       'retweeted_status', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status_permalink', 'quoted_status', 'withheld_in_countries'],
      dtype='object')


 Other info about Twitter dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175078 entries, 0 to 175077
Data columns (total 32 columns):
 #   Column                     Non-Null Count   Dtype       

None


 Basic statistical info about Twitter dataframe:



Unnamed: 0,id,id_str,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,contributors,retweet_count,favorite_count,possibly_sensitive,quoted_status_id,quoted_status_id_str
count,175078.0,175078.0,65499.0,65499.0,74424.0,74424.0,0.0,175078.0,175078.0,61050.0,14337.0,14337.0
mean,1.45991e+18,1.45991e+18,1.45802e+18,1.45802e+18,2.85524e+17,2.85524e+17,,2175.394538,4.640395,0.032039,1.456705e+18,1.456705e+18
std,9343355000000000.0,9343355000000000.0,1.857045e+16,1.857045e+16,5.160189e+17,5.160189e+17,,5476.268498,361.181852,0.176106,4.094802e+16,4.094802e+16
min,1.443818e+18,1.443818e+18,5.785303e+16,5.785303e+16,12.0,12.0,,0.0,0.0,0.0,7.265985e+16,7.265985e+16
25%,1.452045e+18,1.452045e+18,1.449183e+18,1.449183e+18,256658400.0,256658400.0,,0.0,0.0,0.0,1.452712e+18,1.452712e+18
50%,1.459229e+18,1.459229e+18,1.459275e+18,1.459275e+18,415859400.0,415859400.0,,0.0,0.0,0.0,1.461536e+18,1.461536e+18
75%,1.468982e+18,1.468982e+18,1.467672e+18,1.467672e+18,4015386000.0,4015386000.0,,43.0,1.0,0.0,1.468979e+18,1.468979e+18
max,1.477172e+18,1.477172e+18,1.477171e+18,1.477171e+18,1.475984e+18,1.475984e+18,,109237.0,115514.0,1.0,1.477156e+18,1.477156e+18




Sample of records in the Twitter dataframe:


Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,extended_entities,source,in_reply_to_status_id,...,favorited,retweeted,possibly_sensitive,lang,retweeted_status,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,withheld_in_countries
0,2021-10-01 08:25:03+00:00,1443854459625431000,1443854459625431040,#ad The Nike Women's Air More Uptempo 96 'Whit...,False,"[0, 146]","{'hashtags': [{'text': 'ad', 'indices': [0, 3]...","{'media': [{'id': 1443854458404941800, 'id_str...","<a href=""https://www.hootsuite.com"" rel=""nofol...",,...,False,False,0.0,en,,,,,,
1,2021-10-01 07:37:16+00:00,1443842433641558000,1443842433641558016,@_christiankeith @d1vetsam @KicksFinder They a...,False,"[40, 73]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/iphone"" r...",1.443836e+18,...,False,False,,en,,,,,,
2,2021-10-01 10:01:58+00:00,1443878848211439600,1443878848211439616,Proof @LaserShip is stealing. I work from home...,False,"[0, 106]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 1443878844923105300, 'id_str...","<a href=""http://twitter.com/download/iphone"" r...",,...,False,False,0.0,en,,,,,,
3,2021-10-01 07:41:45+00:00,1443843563108905000,1443843563108904960,RT @pyleaks: *LEAK ALERT*: The next Supreme x ...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/android"" ...",,...,False,False,,en,{'created_at': 'Wed Sep 29 15:40:11 +0000 2021...,,,,,
4,2021-10-01 06:22:08+00:00,1443823529296867300,1443823529296867328,RT @SneakerScouts: #ad The Space Jam x Nike Le...,False,"[0, 140]","{'hashtags': [{'text': 'ad', 'indices': [19, 2...",,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,False,False,,en,{'created_at': 'Thu Sep 30 16:10:08 +0000 2021...,,,,,


In [10]:
# Summarise the tweet timestamps to see the period the data covers
tweets_df.loc[:, 'created_at'].describe()

count                                 175078
mean     2021-11-14 15:44:45.898999808+00:00
min                2021-10-01 06:00:43+00:00
25%                2021-10-23 22:51:26+00:00
50%                2021-11-12 18:39:36+00:00
75%         2021-12-09 16:33:31.500000+00:00
max                2022-01-01 06:58:15+00:00
Name: created_at, dtype: object