# R_2 Thorough EDA of 65 GB of tweets on the topic of Politics

- to gauge how many political tweets there are

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse) the Spark session that runs every Spark job in this notebook.
# Settings applied: REPL eager evaluation on, Parquet metadata caching on.
spark = SparkSession.builder.appName("COMP90024_A2_EDA").config(
    "spark.sql.repl.eagerEval.enabled", True
).config(
    "spark.sql.parquet.cacheMetadata", "true"
).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/23 17:06:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the original tweet dump as a Spark DataFrame (no schema is supplied here)
spark_json = spark.read.format('json').load('../data/raw/BigTwitterFile/twitter-huge.json')

                                                                                

In [3]:
# Observe Schema: print the nested column tree of the raw tweet DataFrame.
# NOTE(review): the output lists a `_corrupt_record` column, which presumably means
# some input lines could not be parsed as JSON - confirm before relying on row counts.
spark_json.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- doc: struct (nullable = true)
 |    |-- _id: string (nullable = true)
 |    |-- _rev: string (nullable = true)
 |    |-- data: struct (nullable = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- context_annotations: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- domain: struct (nullable = true)
 |    |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- entity: struct (nullable = true)
 |    |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |    |    |-- name: string (nullable = true)
 |    |    |-- conversation_id: string (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |    |-- entiti

### Extract Key Columns

In [4]:
# Pull the fields this EDA needs out of the nested tweet document.
# Later cells refer to these columns by the last path segment
# (e.g. 'text', 'includes', 'coordinates', 'place_id').
out_df = spark_json.select(
    'doc._id',
    'doc.data.created_at',
    'doc.data.geo.coordinates.coordinates',
    'doc.data.geo.coordinates.type',
    'doc.includes',
    'doc.data.geo.place_id',
    'doc.data.lang',
    'doc.data.sentiment',
    'doc.data.text',
    'doc.data.author_id',
)

### Missing Geolocation

In [64]:
# observe missing values in 'includes' (geolocation)


from pyspark.sql.functions import col, sum

def count_missing_values(df, column_name: str) -> int:
    """Count the rows of a Spark DataFrame whose `column_name` value is null.

    Args:
        df: Spark DataFrame to inspect.
        column_name: Name of the column to check for null values.

    Returns:
        Number of rows in which the column is null; 0 when there are none.
    """
    # Counting the filtered rows is sufficient. The earlier version also summed
    # an is-null flag over the already-filtered rows, which was redundant and
    # produced None (not 0) when the column had no nulls at all.
    return df.filter(col(column_name).isNull()).count()

# number of rows in out_df whose 'includes' column is null
missing_count = count_missing_values(out_df, "includes")

# bare expression so the notebook shows the count as the cell output
missing_count

49300230

In [65]:
# total number of rows in out_df, for comparison with the null count above
out_df.count()

                                                                                

52533743

In [66]:
# total rows in out_df minus rows with a null 'includes' = rows that do have 'includes'
# NOTE(review): both literals are copied by hand from the two cell outputs above;
# they go stale if the input data changes.
52533743 - 49300230

3233513

### Extract Tweets containing Political (Voting) Keywords

In [1]:
import pandas as pd

# Keyword spreadsheet; the search terms are read from its 'keywords' column.
words_df = pd.read_excel('../data/raw/Keywords/Voting Keyword.xlsx')

# Blank spreadsheet cells load as NaN, and an empty-string keyword would match
# every tweet in a substring search, so drop both before building the list.
keywords = words_df['keywords'].dropna()
words = [word for word in keywords if word != '']

In [7]:
# Only keep tweets whose text contains at least one political (voting) keyword

from pyspark.sql.functions import col

if not words:
    raise ValueError("keyword list is empty - nothing to filter tweets on")

# `out_df` exposes the tweet body as 'text' and the place payload as 'includes',
# so refer to them by those names throughout (the previous version mixed
# 'doc.data.text', 'text' and 'doc.includes').
filter_cond = col('text').contains(words[0])  # initial filter condition

# OR the per-keyword conditions together: a tweet matches if ANY keyword occurs in it
for word in words[1:]:
    filter_cond = filter_cond | col('text').contains(word)

filtered_df = out_df.filter(filter_cond)  # tweets mentioning at least one keyword

# of those, keep only the tweets that carry an 'includes' payload
result_df = filtered_df.filter(col('includes').isNotNull())

In [8]:
result_df.count()

                                                                                

319631

In [9]:
# Collect the keyword-matching tweets to the driver as a pandas DataFrame (all columns)
df = result_df.toPandas()

                                                                                

In [56]:
# how many of the collected keyword-matching tweets have a non-null 'includes' value
int(df['includes'].notna().sum())

49952

In [2]:
# Print a sample of the political tweets that have a non-null 'includes' value

N_PREVIEW = 200  # cap the printout so the cell output stays readable

# .head() stops after N_PREVIEW rows, replacing the manual counter + break
for txt in df.loc[df['includes'].notna(), 'text'].head(N_PREVIEW):
    print(txt)
    print('==')

In [3]:
# Inspect the location payload ('includes') of the row labelled 0
df.loc[0, 'includes']

'{"places":[{"full_name":"Brisbane, Queensland","geo":{"type":"Feature","bbox":[152.668522848,-27.767440994,153.31787024,-26.996844991],"properties":{}},"id":"004ec16c62325149"}]}'

# OTHER EDA (NOT IMPORTANT)

In [24]:
# check how many keyword-matching tweets have coordinates
# `filtered_df` is a Spark DataFrame, so `filtered_df['coordinates']` is a Column
# expression that cannot be iterated in Python; count the non-null rows in Spark.
filtered_df.filter(col('coordinates').isNotNull()).count()

4093

In [28]:
# check how many keyword-matching tweets have a place id
# `filtered_df` is a Spark DataFrame, so `filtered_df['place_id']` is a Column
# expression that cannot be iterated in Python; count the non-null rows in Spark.
filtered_df.filter(col('place_id').isNotNull()).count()

62422

In [8]:
# check the time range covered by the raw tweets
# Namespaced import so Spark's min/max are not re-imported over Python's builtins.
from pyspark.sql import functions as F

# Compute both bounds in ONE aggregation so the raw JSON is scanned once, not twice.
# `created_at` is a string column in the schema, so min/max compare as strings;
# NOTE(review): that equals chronological order only if every value uses the same
# fixed-width format as the values printed below - confirm.
time_bounds = spark_json.agg(
    F.min("doc.data.created_at").alias("min_created_at"),
    F.max("doc.data.created_at").alias("max_created_at"),
).first()
min_datetime = time_bounds["min_created_at"]
max_datetime = time_bounds["max_created_at"]

print("Minimum datetime:", min_datetime)
print("Maximum datetime:", max_datetime)



Minimum datetime: 2022-02-10T00:00:00.000Z
Maximum datetime: 2022-08-10T23:59:59.000Z


                                                                                