In [None]:
import gzip
import json
import datetime

from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read from
# this Colab notebook.
drive.mount('/content/drive')

# Project folder on the mounted Drive, expressed relative to the notebook's
# current working directory.
WORKING_DIR = 'drive/MyDrive/dtsa5800_tweets_network_analysis'
DATA_DIR = '/'.join([WORKING_DIR, 'data'])
# Gzip-compressed JSON Lines file; the cells below decode one JSON object per line.
DATA_FILE = '/'.join([DATA_DIR, 'nikelululemonadidas_tweets.jsonl.gz'])

Mounted at /content/drive


## Data

In [None]:
# Peek at the first record to see what a raw tweet object looks like.
with gzip.open(DATA_FILE, mode='rt') as f:
    line = f.readline()

# Parse and pretty-print after the file has been closed; only one line is needed.
data = json.loads(line)
print(json.dumps(data, indent=4))

{
    "created_at": "Fri Oct 01 08:25:03 +0000 2021",
    "id": 1443854459625431000,
    "id_str": "1443854459625431041",
    "full_text": "#ad The Nike Women's Air More Uptempo 96 'White/Opti Yellow' is now available via @footlocker! |$160| #SneakerScouts @Nike https://t.co/5lAq7b2ffU https://t.co/wmjxIcsheP",
    "truncated": false,
    "display_text_range": [
        0,
        146
    ],
    "entities": {
        "hashtags": [
            {
                "text": "ad",
                "indices": [
                    0,
                    3
                ]
            },
            {
                "text": "SneakerScouts",
                "indices": [
                    102,
                    116
                ]
            }
        ],
        "symbols": [],
        "user_mentions": [
            {
                "screen_name": "footlocker",
                "name": "Foot Locker",
                "id": 22030851,
                "id_str": "22030851",
                "ind

### date range, location counts

In [None]:
# One pass over the whole file: collect every parseable timestamp and tally
# tweets per country code ("Unknown" when the tweet carries no place/country).
dates = []
country_counts = {}

with gzip.open(DATA_FILE, 'rt') as f:
    for line in f:
        data = json.loads(line)

        # Tweets without a timestamp are skipped for the date range only;
        # they are still counted in the country tally below.
        created_at = data.get("created_at", "")
        if created_at:
            dates.append(
                datetime.datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y")
            )

        # `place` can be absent or null, hence the `or {}` fallback.
        place = data.get('place') or {}
        country_code = place.get('country_code', '').upper() or "Unknown"
        country_counts[country_code] = country_counts.get(country_code, 0) + 1

if dates:
    min_date, max_date = min(dates), max(dates)
else:
    min_date = max_date = None

first_day = min_date.strftime("%Y-%m-%d") if min_date else "None"
last_day = max_date.strftime("%Y-%m-%d") if max_date else "None"

print(f"Total number of dates: {len(dates)}")
print("Range of Dates:", first_day, "-", last_day, '\n')
print("Unique location count:", len(country_counts))
print("List of locations:", list(country_counts), '\n')
print(f"Total number of data: {sum(country_counts.values())}")
for country_code, count in country_counts.items():
    print(f"{country_code}: {count}")


Total number of dates: 175078
Range of Dates: 2021-10-01 - 2022-01-01 

Unique location count: 6
List of locations: ['Unknown', 'US', 'CA', 'TR', 'MX', 'GR'] 

Total number of data: 175078
Unknown: 169351
US: 5713
CA: 10
TR: 1
MX: 2
GR: 1


### filter tweets

In [None]:
def match_tweet(tweet, country_codes=None, start_date=None, end_date=None):
    """Return True when a tweet passes the optional country and date filters.

    Args:
        tweet: Decoded tweet dict. ``place`` may be missing or ``None``.
            ``created_at`` is expected in the form
            ``"Fri Oct 01 08:25:03 +0000 2021"``.
        country_codes: Iterable of country codes to accept, compared
            case-insensitively. A single code given as a plain string is
            also accepted. ``None`` or empty disables the country filter.
        start_date: Earliest ``datetime.date`` to accept (inclusive), or
            ``None`` for no lower bound.
        end_date: Latest ``datetime.date`` to accept (inclusive), or ``None``
            for no upper bound.

    Returns:
        bool: True if the tweet satisfies every filter that was supplied.
        When a date bound is supplied, a tweet without ``created_at`` does
        not match.

    Raises:
        ValueError: If a date bound is supplied and ``created_at`` is present
            but not in the expected format.
    """
    # A bare string would otherwise be iterated character by character and
    # silently never match a two-letter code.
    if isinstance(country_codes, str):
        country_codes = [country_codes]

    if country_codes:
        wanted = {code.upper() for code in country_codes}
        country_code = ((tweet.get('place') or {}).get('country_code') or '').upper()
        if country_code not in wanted:
            # Skip the comparatively costly timestamp parse for rejected tweets.
            return False

    # Without date bounds the timestamp is irrelevant, so do not require it.
    if start_date is None and end_date is None:
        return True

    raw_created_at = tweet.get("created_at")
    if not raw_created_at:
        return False

    # Parse the offset with %z and normalise to UTC so the calendar date is
    # right even if the offset is not +0000.
    created_at = (
        datetime.datetime.strptime(raw_created_at, "%a %b %d %H:%M:%S %z %Y")
        .astimezone(datetime.timezone.utc)
        .date()
    )
    return (start_date or created_at) <= created_at <= (end_date or created_at)


def filter_tweets(tweets, country_codes=None, start_date=None, end_date=None):
    """Lazily yield the tweets that pass ``match_tweet``.

    Args:
        tweets: Iterable of tweets. Each item may be an already-decoded
            object or a raw JSON ``str``/``bytes`` line, which is decoded
            with ``json.loads`` first.
        country_codes: Forwarded unchanged to ``match_tweet``.
        start_date: Forwarded unchanged to ``match_tweet``.
        end_date: Forwarded unchanged to ``match_tweet``.

    Yields:
        The decoded tweet for every item that matches.
    """
    for item in tweets:
        record = json.loads(item) if isinstance(item, (str, bytes)) else item
        if not match_tweet(record, country_codes, start_date, end_date):
            continue
        yield record

In [None]:
# Keep only tweets tagged with a US place. The file is opened in its default
# binary mode; filter_tweets decodes each bytes line itself.
with gzip.open(DATA_FILE) as f:
    us_tweets = [tweet for tweet in filter_tweets(f, country_codes=['us'])]

len(us_tweets)

5713

In [None]:
# Narrow the US tweets to December 2021 (both bounds are passed to filter_tweets).
start, end = (
    datetime.date.fromisoformat(day) for day in ('2021-12-01', '2021-12-31')
)

dec_us_tweets = [
    tweet for tweet in filter_tweets(us_tweets, start_date=start, end_date=end)
]
len(dec_us_tweets)

1886

### tweet text examples
nike, adidas, lululemon

In [7]:
# Number of records, counted from the start of the file, to inspect.
LIMIT = 10
# Brand keywords to look for, compared against the lower-cased tweet text.
BRANDS = ('nike', 'adidas', 'lululemon')

with gzip.open(DATA_FILE) as f:
    for i, line in enumerate(f):
        if i >= LIMIT:
            break
        data = json.loads(line)
        # Fall back to an empty string so a record without text cannot crash
        # the `.lower()` call below.
        text = data.get('full_text') or data.get('text') or ''
        lowered = text.lower()
        # Test each brand separately: `'nike' or ... in text` is always truthy
        # because the non-empty literal 'nike' short-circuits the `or` chain.
        if any(brand in lowered for brand in BRANDS):
            print(i, text)

0 #ad The Nike Women's Air More Uptempo 96 'White/Opti Yellow' is now available via @footlocker! |$160| #SneakerScouts @Nike https://t.co/5lAq7b2ffU https://t.co/wmjxIcsheP
1 @_christiankeith @d1vetsam @KicksFinder They are available too at @adidas
2 Proof @LaserShip is stealing. I work from home and have a ring doorbell. @wsoctv @Nike @wcnc @wbtv @bbb_us https://t.co/9o3stezjgs
3 RT @pyleaks: *LEAK ALERT*: The next Supreme x @Nike collab for Spring 2022 will feature the Nike Shox Ride 2.
The duo will be dropping 3 co…
4 RT @SneakerScouts: #ad The Space Jam x Nike LeBron 18 Low 'Sylvester vs. Tweety' is now available via @snipes_usa! |$160| #SneakerScouts @K…
5 Via Nike⁠ SNKRS: can I get a W ⁦@Nike⁩ ⁦@nikebasketball⁩ #snkrs  https://t.co/lQ6zKN1Oq6
6 SELENA boosted up Puma stocks by 40% 
Her partnership helped Puma grow faster
 than rivals @Adidas &amp; @Nike https://t.co/uRKsuz32lj
7 RT @etnow: We’re happier than ever as @BillieEilish teams up with @Nike to release sustainable Air Jor