# R_9 Twitter Pipeline

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("COMP90024_A2_EDA")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/02 13:32:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read In Original
spark_json = spark.read.json('../data/raw/BigTwitterFile/twitter-huge.json')

                                                                                

### Extract Key Columns

In [3]:
# important features to extract out of the original dataframe
df_selected_columns = spark_json.select('doc._id', 'doc.data.created_at', 'doc.includes', 
                     'doc.data.lang', 'doc.data.text', 'doc.data.author_id')

### Missing Geolocation

In [4]:
from pyspark.sql.functions import col

# Assume `df` is the dataframe that contains the data
column_name = 'includes'
df_drop_no_geo = df_selected_columns.filter(col(column_name).isNotNull())

In [5]:
df_drop_no_geo.show(5)

                                                                                

+-------------------+--------------------+--------------------+----+--------------------+------------------+
|                _id|          created_at|            includes|lang|                text|         author_id|
+-------------------+--------------------+--------------------+----+--------------------+------------------+
|1491734461951909890|2022-02-10T11:23:...|{"places":[{"full...|  hi|@AshramGzb @Ashra...|858950980989140993|
|1491734528779763719|2022-02-10T11:23:...|{"places":[{"full...|  hi|@AshramGzb @naren...|858950980989140993|
|1491567527322808321|2022-02-10T00:19:...|{"places":[{"full...|  en|My life is hittin...|          45472006|
|1491693811663515654|2022-02-10T08:41:...|{"places":[{"full...|  en|@TobyRayEnglish @...|952342256823943168|
|1491674087378219009|2022-02-10T07:23:...|{"places":[{"full...|  en|@JadeArchaeobot @...|          25033901|
+-------------------+--------------------+--------------------+----+--------------------+------------------+
only showing top 5 

In [6]:
column_name_2 = 'lang'
string_value = 'en'
df_drop_no_geo_eng = df_drop_no_geo.filter(col(column_name_2) == string_value)

In [7]:
df_drop_no_geo_eng.show(1)

+-------------------+--------------------+--------------------+----+--------------------+---------+
|                _id|          created_at|            includes|lang|                text|author_id|
+-------------------+--------------------+--------------------+----+--------------------+---------+
|1491567527322808321|2022-02-10T00:19:...|{"places":[{"full...|  en|My life is hittin...| 45472006|
+-------------------+--------------------+--------------------+----+--------------------+---------+
only showing top 1 row



### Extract Tweets containing AI Keywords

In [8]:
import pandas as pd
words_df = pd.read_excel('../data/raw/Keywords/Voting Keyword.xlsx')

words = list(words_df['keywords'])
words

['accountab',
 'advoca',
 'affirmative action',
 'agenda setting',
 'aspiration',
 'asylum seeker',
 'bureaucra',
 'business',
 'citizen',
 'class',
 'colon',
 'committee',
 'communit',
 'conservati',
 'constitution',
 'convention',
 'corrupt',
 'crisis',
 'cultur',
 'culture wars',
 'federalism',
 'democracy',
 'differen',
 'discourse',
 'economic rationalis',
 'egalitarian',
 'elit',
 'environment',
 'equal',
 'ethic',
 'ethinic',
 'executive',
 'famili',
 'feminis',
 'free',
 'gender',
 'globalis',
 'green',
 'identity',
 'ideolog',
 'indigenous',
 'individual',
 'interest group',
 'international relations',
 'internet',
 'judicia',
 'law',
 'leader',
 'liberal',
 'magnagerialism',
 'mandate',
 'media',
 'ministerialadvisers',
 'minist ',
 'multicultural',
 'national identity',
 'national identity',
 'obligation',
 'others',
 'otherness',
 'parliament',
 'participation',
 'party',
 'parties',
 'polic',
 'political socialisation',
 'populism',
 'post-modern',
 'power',
 'protest',
 '

In [9]:
from functools import reduce
from pyspark.sql.functions import when, col, instr

# Assume `df` is the dataframe that contains the data
column_name = 'text'
new_column_name = 'is_political'


# Apply `instr` for each keyword and use `reduce` to check if any of them are present
condition = reduce(lambda a, b: a | b, [instr(col(column_name), kw) > 0 for kw in words])

# Use `when` to set the value of the new column based on the condition
df_drop_no_geo_eng_withPolCol = df_drop_no_geo_eng.withColumn(new_column_name, when(condition, 1).otherwise(0))

In [10]:
from pyspark.sql.functions import substring, to_date

# Assuming your DataFrame is named `df` and the string column is named `string_col`
df_drop_no_geo_eng_withPolCol_date = df_drop_no_geo_eng_withPolCol.withColumn("date", to_date(substring("created_at", 1, 10)))


In [11]:
df_drop_no_geo_eng_withPolCol_date.show(5)

+-------------------+--------------------+--------------------+----+--------------------+-------------------+------------+----------+
|                _id|          created_at|            includes|lang|                text|          author_id|is_political|      date|
+-------------------+--------------------+--------------------+----+--------------------+-------------------+------------+----------+
|1491567527322808321|2022-02-10T00:19:...|{"places":[{"full...|  en|My life is hittin...|           45472006|           0|2022-02-10|
|1491693811663515654|2022-02-10T08:41:...|{"places":[{"full...|  en|@TobyRayEnglish @...| 952342256823943168|           0|2022-02-10|
|1491674087378219009|2022-02-10T07:23:...|{"places":[{"full...|  en|@JadeArchaeobot @...|           25033901|           0|2022-02-10|
|1491721359587627008|2022-02-10T10:31:...|{"places":[{"full...|  en|@JadeArchaeobot @...|         3103790508|           0|2022-02-10|
|1491595343838220291|2022-02-10T02:10:...|{"places":[{"full...

In [12]:
from pyspark.sql.functions import regexp_extract

# Assuming your DataFrame is named `df` and the string column is named `string_col`
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date.withColumn("coord", regexp_extract("includes", r"(-?\d+\.\d+,-?\d+\.\d+,-?\d+\.\d+,-?\d+\.\d+)", 1))


In [13]:
from pyspark.sql.functions import split

# Assuming your DataFrame is named `df` and the string column is named `string_col`
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date_coord.withColumn("x1", split("coord", ",").getItem(0).cast('float'))
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date_coord.withColumn("y1", split("coord", ",").getItem(1).cast('float'))
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date_coord.withColumn("x2", split("coord", ",").getItem(2).cast('float'))
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date_coord.withColumn("y2", split("coord", ",").getItem(3).cast('float'))

In [14]:
from pyspark.sql.functions import mean

# Assuming your DataFrame is named `df` and the two columns are named `col1` and `col2`
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date_coord.withColumn("x_cent", (df_drop_no_geo_eng_withPolCol_date_coord["x1"] + df_drop_no_geo_eng_withPolCol_date_coord["x2"]) / 2)
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date_coord.withColumn("y_cent", (df_drop_no_geo_eng_withPolCol_date_coord["y1"] + df_drop_no_geo_eng_withPolCol_date_coord["y2"]) / 2)

In [16]:
df_drop_no_geo_eng_withPolCol_date_coord = df_drop_no_geo_eng_withPolCol_date_coord.select('_id', 'date', 'x_cent', 'y_cent', 
                     'text', 'author_id', 'is_political')

In [27]:
# Assuming your DataFrame is named `df` and you want to save it in a directory named `output_directory`
df_drop_no_geo_eng_withPolCol_date_coord.write.format("parquet").mode("overwrite").save("../data/curated/Twitter/twitter.parquet")

In [25]:
import pandas as pd

df = pd.read_parquet('../data/curated/Twitter/twitter.parquet')

In [26]:
df

Unnamed: 0,_id,date,x_cent,y_cent,text,author_id,is_political
0,1491567527322808321,2022-02-10,150.520142,-23.339474,My life is hitting a big change soon. Keen to ...,45472006,0
1,1491693811663515654,2022-02-10,153.369354,-27.954222,@TobyRayEnglish @MikeDel21893959 @aSinister @b...,952342256823943168,0
2,1491674087378219009,2022-02-10,151.926880,-27.573589,@JadeArchaeobot @HarvardGSAS oh ewwww,25033901,0
3,1491721359587627008,2022-02-10,115.928314,-32.150101,@JadeArchaeobot @HarvardGSAS I'm so sorry you ...,3103790508,0
4,1491595343838220291,2022-02-10,150.931976,-33.848244,@BehnamAkhavan @EngAustralia @Eng_IT_Sydney @S...,1249497357944672257,0
...,...,...,...,...,...,...,...
2517272,1557517139002925056,2022-08-10,144.954147,-37.824257,Day two has kicked off #EduTECHAU!\n\nMeet our...,1468040114781655040,0
2517273,1557448918329266176,2022-08-10,152.993195,-27.382143,@bluboy43 @TaikaWaititi People will still like...,20742804,0
2517274,1557499571642593280,2022-08-10,150.931976,-33.848244,@AMCELL @puck_fair What’s happening here?,1010068200,0
2517275,1557502623947030528,2022-08-10,150.931976,-33.848244,@AMCELL @puck_fair That’s sad.,1010068200,0


In [38]:
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np

def get_sentiment(x):

    sia = SentimentIntensityAnalyzer()

    sia_out = sia.polarity_scores(x)

    neg = sia_out['neg']
    pos = sia_out['pos']
    compound = sia_out['compound']

    sentiment = np.argmax({-1:sia_out['neg'], 0: sia_out['neu'], 1:sia_out['pos']})

    return neg, pos, compound, sentiment

In [39]:
df[["neg_score", "pos_score", "compound_score", "sentiment"]] = df["text"].apply(lambda x: pd.Series(get_sentiment(x)))

In [40]:
df.to_parquet('../data/curated/twitter_with_sent.parquet')

In [50]:
df[df['is_political']==1]

Unnamed: 0,_id,date,x_cent,y_cent,text,author_id,is_political,neg_score,pos_score,compound_score,sentiment
8,1491795863018274816,2022-02-10,152.993195,-27.382143,@IMmadashellandi @HeshmatAlavi Israel is a dem...,4516171573,1,0.228,0.000,-0.7955,0.0
15,1491725195375439877,2022-02-10,152.993195,-27.382143,"@stilllukebowden More power to Barbara, Sue an...",177543302,1,0.000,0.000,0.0000,0.0
24,1491648779082100739,2022-02-10,150.931976,-33.848244,@Dr_eVal @Sg3912 @parsleysmum @ardentgreeneyes...,987281520099344384,1,0.107,0.030,-0.6876,0.0
31,1491626908106190848,2022-02-10,145.053131,-37.972565,@sammyjcomedian @NikkiKlopfer Another reclaime...,418231895,1,0.000,0.372,0.8360,0.0
53,1491588166888742912,2022-02-10,145.053131,-37.972565,@MattGurchenko @MrInbetween1 @Garyandthegoat1 ...,1466177366506549251,1,0.000,0.000,0.0000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2517233,1557514633623191553,2022-08-10,152.993195,-27.382143,Trump Attorney wasn’t allowed presence when KG...,1511175941846102021,1,0.144,0.000,-0.7096,0.0
2517243,1557515011781640193,2022-08-10,152.400208,-27.636768,Mod parties be like https://t.co/jm21gF9qIy,803641382,1,0.000,0.634,0.6369,0.0
2517255,1557515835237072896,2022-08-10,144.952393,-37.824680,"""Teacher's need to start where they're comfort...",1074293586,1,0.000,0.000,0.0000,0.0
2517256,1557515963662077953,2022-08-10,152.993195,-27.382143,Laid back sleeping ordinary undecided conserv...,1511175941846102021,1,0.118,0.000,-0.4710,0.0


In [55]:
df['pos_sentiment'] = df['pos_score']>df['neg_score']

In [56]:
df

Unnamed: 0,_id,date,x_cent,y_cent,text,author_id,is_political,neg_score,pos_score,compound_score,sentiment,pos_sentiment
0,1491567527322808321,2022-02-10,150.520142,-23.339474,My life is hitting a big change soon. Keen to ...,45472006,0,0.052,0.244,0.7412,0.0,True
1,1491693811663515654,2022-02-10,153.369354,-27.954222,@TobyRayEnglish @MikeDel21893959 @aSinister @b...,952342256823943168,0,0.000,0.067,0.4767,0.0,True
2,1491674087378219009,2022-02-10,151.926880,-27.573589,@JadeArchaeobot @HarvardGSAS oh ewwww,25033901,0,0.000,0.000,0.0000,0.0,False
3,1491721359587627008,2022-02-10,115.928314,-32.150101,@JadeArchaeobot @HarvardGSAS I'm so sorry you ...,3103790508,0,0.162,0.000,-0.1880,0.0,False
4,1491595343838220291,2022-02-10,150.931976,-33.848244,@BehnamAkhavan @EngAustralia @Eng_IT_Sydney @S...,1249497357944672257,0,0.000,0.406,0.6249,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2517272,1557517139002925056,2022-08-10,144.954147,-37.824257,Day two has kicked off #EduTECHAU!\n\nMeet our...,1468040114781655040,0,0.000,0.275,0.9167,0.0,True
2517273,1557448918329266176,2022-08-10,152.993195,-27.382143,@bluboy43 @TaikaWaititi People will still like...,20742804,0,0.000,0.200,0.3612,0.0,True
2517274,1557499571642593280,2022-08-10,150.931976,-33.848244,@AMCELL @puck_fair What’s happening here?,1010068200,0,0.000,0.000,0.0000,0.0,False
2517275,1557502623947030528,2022-08-10,150.931976,-33.848244,@AMCELL @puck_fair That’s sad.,1010068200,0,0.508,0.000,-0.4767,0.0,False


In [61]:
import geopandas as gpd
from shapely.geometry import Point

# Load the GeoPandas polygon data
polygon_gdf = gpd.read_file("../data/raw/Geography/2021_ELB_region.shp")


In [62]:
polygon_gdf

Unnamed: 0,E_div_numb,Elect_div,Numccds,Actual,Projected,Total_Popu,Australian,Area_SqKm,Sortname,geometry
0,1,Banks,359,0,0,0,0,49.47,Banks,"POLYGON Z ((151.12967 -33.97363 0.00000, 151.1..."
1,2,Barton,386,0,0,0,0,39.65,Barton,"POLYGON Z ((151.17424 -33.92497 0.00000, 151.1..."
2,3,Bennelong,364,0,0,0,0,58.76,Bennelong,"POLYGON Z ((151.15908 -33.79871 0.00000, 151.1..."
3,4,Berowra,389,0,0,0,0,741.64,Berowra,"POLYGON Z ((151.28480 -33.57221 0.00000, 151.2..."
4,5,Blaxland,419,0,0,0,0,61.16,Blaxland,"POLYGON Z ((151.04441 -33.84205 0.00000, 151.0..."
...,...,...,...,...,...,...,...,...,...,...
146,11,O'Connor,543,114035,113375,0,0,1126936.75,O'Connor,"MULTIPOLYGON Z (((129.00205 -30.76778 0.00000,..."
147,12,Pearce,345,106306,118884,0,0,782.75,Pearce,"POLYGON Z ((115.98284 -31.74189 0.00000, 115.9..."
148,13,Perth,394,116242,118518,0,0,79.92,Perth,"POLYGON Z ((115.96432 -31.88769 0.00000, 115.9..."
149,14,Swan,418,114942,117373,0,0,150.89,Swan,"POLYGON Z ((116.04102 -31.98350 0.00000, 116.0..."


In [63]:
# Convert the coordinate column to a Point GeoSeries
df["point"] = df.apply(lambda row: Point(row["x_cent"], row["y_cent"]), axis=1)

  arr = construct_1d_object_array_from_listlike(values)


In [66]:
def get_Electorate(coord):
    for idx, polygon in polygon_gdf.iterrows():
        if polygon.geometry.contains(coord):
            return polygon['Elect_div']
    
    return None

In [67]:
df["electorate"] = df["point"].apply(get_Electorate)

23/05/03 04:03:59 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 919013 ms exceeds timeout 120000 ms
23/05/03 04:04:00 WARN SparkContext: Killing executors is not supported by current scheduler.


KeyboardInterrupt: 

23/05/03 12:40:01 WARN TransportChannelHandler: Exception in connection from /192.168.3.86:50048
java.io.IOException: Operation timed out
	at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
	at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
	at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
	at sun.nio.ch.IOUtil.read(IOUtil.java:192)
	at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:379)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:258)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:722)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:658)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java

In [None]:
df.to_parquet('../data/curated/twitter_with_sent_electorate.parquet')