# Project Title

-description-

## Table of Contents

1. Importing modules

2. Data processing

...

## 1. Importing modules

In [1]:
import csv
from zipfile import ZipFile
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

## 2. Data processing

In [2]:
#Initialize a spark session.
def init_spark():
    """Build and return the SparkSession used by this notebook.

    The session is obtained through ``getOrCreate()`` on a builder that sets
    the application name and a single config option.

    Returns:
        The ``SparkSession`` produced by the builder.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

In [6]:
spark = init_spark()

# Load the influencer listing as an RDD of raw text lines.
lines = spark.sparkContext.textFile('data/influencers.txt')

# The first two lines are treated as header material; the first one holds
# the tab-separated column names used to locate the fields we need.
headers = lines.take(2)
header = headers[0]
columns = header.split("\t")
category_index = columns.index("Category")
username_index = columns.index("Username")

# Drop the header lines and split every remaining line into its fields.
lines = (
    lines
    .filter(lambda line: line not in headers)
    .map(lambda line: line.split("\t"))
)

# Keep only the rows whose category is 'travel', then project their
# Instagram usernames.
travel_influencers = lines.filter(
    lambda fields: fields[category_index] == 'travel'
)
travel_usernames = travel_influencers.map(
    lambda fields: fields[username_index]
)

# Sanity checks: both counts should agree; the last line shows a sample name.
print(travel_influencers.count())
print(travel_usernames.count())
print(travel_usernames.collect()[3])

4210
4210
wake.up.matt


In [12]:
# Extract from the archive every member whose name contains the username of
# a travel influencer.
# TODO: matching is still a substring scan over every username per file name.
with ZipFile('data/post-metadata/fix.zip', 'r') as zipObject:
    names = zipObject.namelist()
    # Bring the usernames to the driver ONCE. Calling collect() inside the
    # loop re-ran the whole Spark job for every archive member.
    usernames = travel_usernames.collect()
    for file_name in names:
        if any(i in file_name for i in usernames):
            # Extract a travel influencers post metadata from zip
            zipObject.extract(file_name, 'data/post-metadata')
            # NOTE(review): this message is printed once per extracted file
            # and mentions "python files"; confirm the intended wording.
            print('All the python files are extracted')

['zip/', 'zip/test1.txt', 'zip/test2.txt', 'zip/test3.txt']
zip/test1.txt
zip/test2.txt


In [None]:
# Read the post metadata files.
# ./data is where all the .info files reside; change the path accordingly.
# Every file matching the glob is parsed as JSON into one DataFrame.
# NOTE(review): the extraction cell writes under 'data/post-metadata' --
# confirm this glob actually points at the extracted files.
df = spark.read.json('./data/*.info')
# Work on the underlying RDD so the extract_* helpers below can be mapped
# over the individual rows.
rdd = df.rdd

#functions to extract data from Row rdd
def extract_location(row):
    """Return the ``(name, id)`` pair of a post's location record.

    Args:
        row: The location record (indexable by ``'name'`` and ``'id'``),
            or ``None`` when the post has no location.

    Returns:
        A 2-tuple ``(row['name'], row['id'])``, or ``('', '')`` when
        ``row`` is ``None``.
    """
    if row is None:
        return '', ''
    return row['name'], row['id']

def extract_hash_tags(row):
    """Collect the hashtags found in a post's caption edges.

    Each edge's ``node['text']`` is split on whitespace, and every token
    that starts with ``'#'`` is kept, in order of appearance.

    Args:
        row: The caption record (indexable by ``'edges'``), or ``None``.

    Returns:
        A list of hashtag tokens (including the leading ``'#'``). The list
        is empty when ``row`` is ``None``, when ``row['edges']`` is empty
        or ``None``, or when no token qualifies.
    """
    result = []
    if row is None:
        return result

    for edge in (row['edges'] or []):
        # Be as tolerant of nulls as the sibling extractors: an edge with
        # no node or no caption text simply contributes no hashtags.
        node = edge['node'] if edge is not None else None
        text = node['text'] if node is not None else None
        if not text:
            continue
        # str.split() with no argument already discards surrounding
        # whitespace, so the tokens need no further stripping.
        result.extend(word for word in text.split() if word.startswith('#'))
    return result

def extract_count_likes(row):
    """Return the like count stored in a post's like record.

    Args:
        row: The like record (indexable by ``'count'``), or ``None``.

    Returns:
        ``row['count']``, or ``0`` when either ``row`` or the count itself
        is ``None``.
    """
    if row is not None:
        likes = row['count']
        if likes is not None:
            return likes
    return 0

def extract_owner_username(row):
    """Return the username stored in a post's owner record.

    Args:
        row: The owner record (indexable by ``'username'``), or ``None``.

    Returns:
        ``row['username']``, or ``''`` when either ``row`` or the username
        itself is ``None``.
    """
    if row is not None:
        username = row['username']
        if username is not None:
            return username
    return ''

def create_post(row):
    """Flatten one post metadata row into a plain dictionary.

    Args:
        row: A post record indexable by ``'location'``,
            ``'edge_media_to_caption'``, ``'id'``,
            ``'edge_media_preview_like'`` and ``'owner'``.

    Returns:
        A dict with the keys ``post_id``, ``location_name``,
        ``location_id``, ``hash_tags``, ``count_likes`` and
        ``owner_username``. A falsy ``row['id']`` becomes ``''``.
    """
    location = extract_location(row['location'])
    post = {
        'post_id': None,  # placeholder keeps the key order; filled in below
        'location_name': location[0],
        'location_id': location[1],
        'hash_tags': extract_hash_tags(row['edge_media_to_caption']),
    }
    post['post_id'] = row['id'] or ''
    post['count_likes'] = extract_count_likes(row['edge_media_preview_like'])
    post['owner_username'] = extract_owner_username(row['owner'])
    return post

# Build one post dictionary (see create_post) for every metadata row.
rdd.map(create_post)  # append .collect() to inspect the data locally