Read the final tweets

In [1]:
# Preview a 100-row sample of the final tweets table to sanity-check its contents.
preview_query = "SELECT * FROM twitterdata.final_tweets LIMIT 100"
df = spark.sql(preview_query)
display(df)

StatementMeta(, 529bb9ba-903f-4f51-bab8-0d349de3b815, 3, Finished, Available)

SynapseWidget(Synapse.DataFrame, 4d9e241d-dcd4-40c3-b699-75a6d9aa49cb)

Load the required libraries

In [2]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from collections import Counter
from pyspark.sql.functions import udf, col, explode, array
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
from nltk import word_tokenize, bigrams

StatementMeta(, cb693762-c20e-4568-a283-6a5cde6ecd0e, 4, Finished, Available)

[nltk_data] Downloading package punkt to /home/trusted-service-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Extract the unigrams and their frequencies

In [3]:
# Ensure the 'punkt' tokenizer models are downloaded in this notebook session
# before any tokenization runs (NLTK reports "already up-to-date" when the
# package is present, so re-running this cell is harmless).
nltk.download('punkt')

# Define a UDF to tokenize tweets
def tokenize(text):
    """Split a tweet into word tokens, dropping stray quote tokens.

    Intended to be wrapped as a Spark UDF, so it is called once per row.

    Args:
        text: The tweet text, or None for a NULL column value.

    Returns:
        A list of token strings with the quote artefacts ``, '' and '
        removed. Returns an empty list when ``text`` is None.
    """
    # NULL tweets reach a Python UDF as None; word_tokenize(None) would raise
    # and fail the whole Spark job, so treat them as having no tokens.
    if text is None:
        return []

    try:
        tokens = word_tokenize(text)
    except LookupError:
        # The 'punkt' models are missing in this Python process. Download them
        # once and retry, instead of invoking nltk.download() on every row.
        nltk.download('punkt', quiet=True)
        tokens = word_tokenize(text)

    unwanted = {'``', "''", "'"}
    return [token for token in tokens if token not in unwanted]

# Expose tokenize() to Spark as a UDF that yields an array of strings per row.
tokenize_udf = udf(f=tokenize, returnType=ArrayType(StringType()))

def unigram_tweets(top_n=20, output_table='unigrams'):
    """Compute the most frequent unigrams in the final tweets and persist them.

    Reads ``twitterdata.final_tweets``, tokenizes the ``Tweet`` column with
    ``tokenize_udf``, counts how often each token occurs, and overwrites a
    Delta table with the ``top_n`` most frequent tokens. The same rows are
    then displayed in the notebook.

    Args:
        top_n: Number of most frequent unigrams to keep (default 20).
        output_table: Name of the Delta table to overwrite (default 'unigrams').
    """
    # Load data into a Spark DataFrame
    tweets_df = spark.sql("SELECT * FROM twitterdata.final_tweets")

    # Tokenize each tweet, then produce one row per token
    tokens_df = tweets_df.withColumn('Tokens', tokenize_udf(col('Tweet')))
    words_df = tokens_df.withColumn('Word', explode(col('Tokens')))

    # Count occurrences of each token
    freq_df = words_df.groupBy('Word').count().withColumnRenamed('count', 'Frequency')

    # Sort by frequency (descending). The secondary sort on Word breaks ties
    # deterministically: the plan is evaluated twice below (write + display),
    # and without a tie-break the two evaluations could pick different rows
    # at the top-N boundary.
    freq_df = freq_df.orderBy(col('Frequency').desc(), col('Word').asc())

    # Keep only the top N unigrams
    top_unigrams = freq_df.limit(top_n)

    # Save the result as a Delta table, replacing any previous contents
    top_unigrams.write.format('delta').mode('overwrite').saveAsTable(output_table)
    display(top_unigrams)


unigram_tweets()


StatementMeta(, cb693762-c20e-4568-a283-6a5cde6ecd0e, 5, Finished, Available)

[nltk_data] Downloading package punkt to /home/trusted-service-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/trusted-service-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


SynapseWidget(Synapse.DataFrame, 39de2a1e-71b8-476f-97d4-b45e13b9c946)

In [8]:

# Define a UDF to generate bigrams
def generate_bigrams(tokens):
    """Return the consecutive token pairs (bigrams) of a token sequence.

    Intended to be wrapped as a Spark UDF, so it is called once per row.
    Pairing adjacent tokens needs no tokenizer model, so no NLTK download
    is attempted here.

    Args:
        tokens: An iterable of token strings, or None for a NULL column value.

    Returns:
        A list of ``(first, second)`` tuples in order of appearance. Returns
        an empty list when ``tokens`` is None or has fewer than two items.
    """
    # A NULL array column arrives as None; treat it as having no bigrams
    # rather than letting the UDF raise and fail the job.
    if tokens is None:
        return []

    # Materialize so slicing works for any iterable, not just lists.
    tokens = list(tokens)
    return list(zip(tokens, tokens[1:]))

# Wrap the plain-Python helpers as Spark UDFs. Tokens are an array of strings;
# bigrams are an array of such arrays (one inner array per token pair).
token_array_type = ArrayType(StringType())
tokenize_udf = udf(tokenize, token_array_type)
bigrams_udf = udf(generate_bigrams, ArrayType(token_array_type))

def bigram_tweets(top_n=20, output_table='biagrams',
                  excluded_bigrams=(('herdsmen', "'"), ('b', "''"), ('ca', "n't"))):
    """Compute the most frequent bigrams in the final tweets and persist them.

    Reads ``twitterdata.final_tweets``, tokenizes the ``Tweet`` column,
    builds consecutive token pairs, counts each pair, drops the excluded
    pairs, and overwrites a Delta table with the ``top_n`` most frequent
    bigrams. The same rows are then displayed in the notebook.

    Args:
        top_n: Number of most frequent bigrams to keep (default 20).
        output_table: Name of the Delta table to overwrite.
            NOTE(review): the default 'biagrams' looks like a typo for
            'bigrams'; it is kept so existing consumers of the table keep
            working -- confirm before renaming.
        excluded_bigrams: Iterable of ``(first, second)`` token pairs to
            remove from the result.
    """
    # Local import: `lit` is not among the notebook's top-level imports.
    from pyspark.sql.functions import lit

    # Load data into a Spark DataFrame
    tweets_df = spark.sql("SELECT * FROM twitterdata.final_tweets")

    # Tokenize each tweet, build its bigrams, then produce one row per bigram
    tokens_df = tweets_df.withColumn('tokens', tokenize_udf(col('Tweet')))
    pairs_df = tokens_df.withColumn('bigrams', bigrams_udf(col('tokens')))
    bigrams_df = pairs_df.withColumn('bigram', explode(col('bigrams')))

    # Count occurrences of each bigram
    bigram_freq_df = bigrams_df.groupBy('bigram').count().withColumnRenamed('count', 'Frequency')

    # Filter out specific bigrams while the column is still an array.
    # Comparing after the string cast does not work: Spark renders the array
    # as "[ca, n't]", which never equals Python's tuple repr "('ca', \"n't\")".
    for first, second in excluded_bigrams:
        bigram_freq_df = bigram_freq_df.filter(
            col('bigram') != array(lit(first), lit(second))
        )

    # Convert the bigram from an array to a string for better readability
    bigram_freq_df = bigram_freq_df.withColumn('bigram', col('bigram').cast(StringType()))

    # Sort by frequency (descending). The secondary sort on the bigram text
    # breaks ties deterministically, because the plan is evaluated twice
    # below (write + display).
    bigram_freq_df = bigram_freq_df.orderBy(col('Frequency').desc(), col('bigram').asc())

    # Keep only the top N bigrams
    top_bigrams = bigram_freq_df.limit(top_n)

    # Save the result as a Delta table, replacing any previous contents
    top_bigrams.write.format('delta').mode('overwrite').saveAsTable(output_table)

    # Show the saved rows, mirroring unigram_tweets()
    display(top_bigrams)


bigram_tweets()


StatementMeta(, 529bb9ba-903f-4f51-bab8-0d349de3b815, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, b3e45db7-9120-4ae8-b42f-31feb9d72a50)