In [None]:
#!pip install pyspark

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark import SparkConf

In [None]:
from google.colab import drive
# Mount Google Drive so the dataset files under /content/drive can be read.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [None]:
# Spark configuration: run locally on all available cores.
# The key must be spelled "spark.executor.memory"; the previous spelling
# ("executer") is not a Spark setting, so the 2g value was never applied.
# NOTE(review): in local mode the executor runs inside the driver JVM, so
# driver memory may be the setting that actually matters here — confirm.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .set("spark.executor.memory", "2g")
)

# getOrCreate() reuses a running context/session, so re-running this cell
# does not fail because a SparkContext already exists.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Only surface errors in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

In [None]:

# Load the Yelp user table from the "yelp-dataset-001" folder of the mounted
# Google Drive. No schema is supplied, so every column is read as a string
# (see the printed schema).
file_path = '/content/drive/My Drive/yelp-dataset-001/yelp_user.csv'

user_csv_reader = (
    spark.read
    .format("csv")
    .option("header", "true")      # first row holds the column names
    .option("multiline", "true")   # a record may span several lines
)
df = user_csv_reader.load(file_path)

# Inspect the column layout and preview a single row.
df.printSchema()
df.show(1)


root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- friends: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- cool: string (nullable = true)
 |-- fans: string (nullable = true)
 |-- elite: string (nullable = true)
 |-- average_stars: string (nullable = true)
 |-- compliment_hot: string (nullable = true)
 |-- compliment_more: string (nullable = true)
 |-- compliment_profile: string (nullable = true)
 |-- compliment_cute: string (nullable = true)
 |-- compliment_list: string (nullable = true)
 |-- compliment_note: string (nullable = true)
 |-- compliment_plain: string (nullable = true)
 |-- compliment_cool: string (nullable = true)
 |-- compliment_funny: string (nullable = true)
 |-- compliment_writer: string (nullable = true)
 |-- compliment_photos: string (nullable = true)

+--------------------+-----+---------

In [None]:
# Collect the distinct raw strings found in the "elite" column.
elite_column = df.select("elite")
unique_elite_values = elite_column.distinct()

# Preview them (show() prints at most 20 rows by default).
unique_elite_values.show()


+--------------------+
|               elite|
+--------------------+
|2015, 2014, 2013,...|
|2013, 2014, 2009,...|
|2012, 2009, 2015,...|
|2016, 2015, 2017,...|
|2013, 2017, 2014,...|
|2009, 2007, 2008,...|
|2015, 2016, 2010,...|
|2009, 2012, 2013,...|
|2011, 2008, 2009,...|
|          2017, 2010|
|2012, 2008, 2009,...|
|2016, 2015, 2012,...|
|2010, 2015, 2008,...|
|2013, 2015, 2011,...|
|2009, 2011, 2008,...|
|2012, 2017, 2013,...|
|2012, 2010, 2011,...|
|2013, 2010, 2011,...|
|2014, 2009, 2010,...|
|2011, 2017, 2010,...|
+--------------------+
only showing top 20 rows



In [None]:
from pyspark.sql.functions import explode, split, col

# Turn the comma-separated "elite" string into one row per (user, year).
# The pattern is a raw string so that "\s" reaches the regex engine unchanged
# instead of being parsed as an (invalid) Python escape sequence.
df2 = (
    df.withColumn("elite_years", split(col("elite"), r",\s*"))
      .select("user_id", explode("elite_years").alias("elite_year"))
)

# Count users per elite year. Users whose "elite" value is the literal string
# "None" end up in their own "None" bucket.
elite_users_by_year = df2.groupBy("elite_year").count().orderBy("elite_year")

# Show the results
elite_users_by_year.show()

+----------+-------+
|elite_year|  count|
+----------+-------+
|      2005|    140|
|      2006|    887|
|      2007|   2363|
|      2008|   3621|
|      2009|   6536|
|      2010|  10485|
|      2011|  13185|
|      2012|  17777|
|      2013|  19841|
|      2014|  20488|
|      2015|  26018|
|      2016|  30856|
|      2017|  34928|
|      None|1265282|
+----------+-------+



In [None]:
from pyspark.sql.functions import when, col

# Flag elite users: 0 when "elite" is the literal string "None" or is missing,
# 1 otherwise. The explicit null check matters because comparing NULL with
# "None" yields NULL, which would fall through to otherwise(1) and mark users
# with no elite information as elite.
is_not_elite = col("elite").isNull() | (col("elite") == "None")
df = df.withColumn("elite_flag", when(is_not_elite, 0).otherwise(1))

# Show the DataFrame with the new column
df.show(1)


+--------------------+-----+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+----------+
|             user_id| name|review_count|yelping_since|             friends|useful|funny|cool|fans|elite|average_stars|compliment_hot|compliment_more|compliment_profile|compliment_cute|compliment_list|compliment_note|compliment_plain|compliment_cool|compliment_funny|compliment_writer|compliment_photos|elite_flag|
+--------------------+-----+------------+-------------+--------------------+------+-----+----+----+-----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+----------+
|JJ-aSuM4pCFPdkfoZ...|Chris|          10|   2013-09-24|

In [None]:
from pyspark.sql.functions import col

# Tally how many users fall under each elite_flag value.
elite_flag_groups = df.groupBy("elite_flag")
elite_flag_counts = elite_flag_groups.count()

# Display the per-flag totals.
elite_flag_counts.show()


+----------+-------+
|elite_flag|  count|
+----------+-------+
|         1|  60818|
|         0|1265282|
+----------+-------+



In [None]:


# Load the Yelp review table from the same Drive folder. No schema is
# supplied, so every column is read as a string (see the printed schema).
file_path = '/content/drive/My Drive/yelp-dataset-001/yelp_review.csv'

review_csv_reader = (
    spark.read
    .format("csv")
    .option("header", "true")      # first row holds the column names
    .option("multiline", "true")   # a record may span several lines
)
dfr = review_csv_reader.load(file_path)

# Inspect the column layout and preview a single row.
dfr.printSchema()
dfr.show(1)

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- cool: string (nullable = true)

+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+
|           review_id|             user_id|         business_id|stars|      date|                text|useful|funny|cool|
+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+
|vkVSCC7xljjrAI4UG...|bv2nCi5Qv5vroFiqK...|AEx2SYEUJmTxVVB18...|    5|2016-05-28|Super simple plac...|     0|    0|   0|
+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+
only showing top 1 row



# Import Review Data


In [None]:
# Attach each review author's elite_flag to the review rows.
# The guard keeps this cell idempotent: re-running it would otherwise join a
# second elite_flag column onto dfr and make later references to
# dfr.elite_flag ambiguous.
if "elite_flag" not in dfr.columns:
    user_flags = df.select("user_id", "elite_flag")
    # Inner join: reviews whose user_id is absent from the user table are dropped.
    dfr = dfr.join(user_flags, on="user_id", how="inner")


In [None]:
# Preview one joined review row to confirm the elite_flag column is present.
dfr.show(n=1)

+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+----------+
|             user_id|           review_id|         business_id|stars|      date|                text|useful|funny|cool|elite_flag|
+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+----------+
|--CJT4d-S8UhwqHe0...|bklQqMXHT-JYiVQdB...|CHbbzfGr69CQV1fh6...|    1|2015-08-18|I took my motorcy...|     3|    3|   0|         0|
+--------------------+--------------------+--------------------+-----+----------+--------------------+------+-----+----+----------+
only showing top 1 row



# Text Generation

In [None]:
import IPython
# Raise the IOPub data-rate limit to 10,000,000 bytes/sec (about 10 MB/s) on
# the running IPython instance's config object.
# NOTE(review): iopub_data_rate_limit is usually a notebook-server option;
# confirm that assigning it here actually changes the effective limit.
ipython_shell = IPython.get_ipython()
ipython_shell.config.IPKernelApp.iopub_data_rate_limit = 10000000


In [None]:
from collections import defaultdict
import random

def markov_chain(text):
    """Build a first-order word-transition table from ``text``.

    The text is split on single space characters only, so punctuation stays
    attached to its word and consecutive spaces produce empty-string tokens.

    Returns a plain ``dict`` mapping each token (except a token that only
    appears last) to the list of tokens that directly followed it, in order of
    appearance and with repeats kept.
    """
    tokens = text.split(' ')

    successors = {}
    # Walk every adjacent (current, following) pair of tokens.
    for position in range(len(tokens) - 1):
        current_token = tokens[position]
        following_token = tokens[position + 1]
        successors.setdefault(current_token, []).append(following_token)

    return successors

####################
def efficient_sample(dataframe, fraction, seed=None):
    """Return a random sample of roughly ``fraction`` of the rows of ``dataframe``.

    Uses ``DataFrame.sample`` without replacement, so rows are drawn from the
    whole DataFrame instead of being taken from its start, and no separate
    ``count()`` job is triggered. The size of the result is approximate: it is
    not guaranteed to be exactly ``fraction`` of the row count.

    Args:
        dataframe: Spark DataFrame to sample from.
        fraction: Fraction of rows to keep, expected in the range [0.0, 1.0].
        seed: Optional integer seed; pass one to get a reproducible sample.

    Returns:
        A new DataFrame holding the sampled rows.
    """
    return dataframe.sample(withReplacement=False, fraction=fraction, seed=seed)

# Sample 1% of the reviews for each user group.
# NOTE(review): "clean_text_column" is not created by any cell shown above
# (dfr only carries the raw "text" column) — confirm where it is produced.
SAMPLE_FRACTION = 0.01


def _sampled_tokens_as_text(sampled_df):
    """Join every token of the sampled ``clean_text_column`` rows into one string.

    The sample is collected to the driver with ``toPandas()``. ``explode``
    turns an empty token list into NaN; those entries are dropped so that
    ``' '.join`` only ever receives strings.
    Presumably each cell holds a list of word tokens — verify against the cell
    that builds ``clean_text_column``.
    """
    tokens = sampled_df.toPandas()['clean_text_column'].explode().dropna()
    return ' '.join(tokens.astype(str))


elite_sample = efficient_sample(
    dfr.filter(dfr.elite_flag == 1).select("clean_text_column"),
    fraction=SAMPLE_FRACTION,
)
non_elite_sample = efficient_sample(
    dfr.filter(dfr.elite_flag == 0).select("clean_text_column"),
    fraction=SAMPLE_FRACTION,
)

# Flatten the per-review token lists into one space-separated string per group.
elite_text = _sampled_tokens_as_text(elite_sample)
non_elite_text = _sampled_tokens_as_text(non_elite_sample)

# Write the text data to files so later cells can reload it without re-sampling.
with open('elite_text.txt', 'w') as f:
    f.write(elite_text)

with open('non_elite_text.txt', 'w') as f:
    f.write(non_elite_text)


# Generate Markov chains for elite and non-elite text
#elite_markov_chain = markov_chain(elite_text)
#non_elite_markov_chain = markov_chain(non_elite_text)

# Print sample Markov chain dictionaries
#print("Markov chain for elite users:\n", elite_markov_chain)
#print("\nMarkov chain for non-elite users:\n", non_elite_markov_chain)


In [None]:
def _read_text_file(path):
    """Return the full contents of the text file at ``path``."""
    with open(path, 'r') as handle:
        return handle.read()


# Reload the sampled review text from the files on disk.
elite_text = _read_text_file('elite_text.txt')
non_elite_text = _read_text_file('non_elite_text.txt')

# Build one word -> next-words dictionary per user group.
elite_markov_chain = markov_chain(elite_text)
non_elite_markov_chain = markov_chain(non_elite_text)


In [None]:
def print_markov_chain_summary(markov_chain, max_items=20):
    """Print each word of a Markov chain with its number of recorded successors.

    Printing every entry of a large chain floods the notebook output, so only
    the first ``max_items`` entries are printed, followed by a one-line note
    saying how many were left out.

    Args:
        markov_chain: Mapping of word -> list of words that followed it.
        max_items: Maximum number of entries to print. Pass ``None`` to print
            every entry.
    """
    for position, (word, next_words) in enumerate(markov_chain.items()):
        if max_items is not None and position >= max_items:
            # Report what was skipped instead of silently truncating.
            print(f"... ({len(markov_chain) - position} more words not shown)")
            break
        print(f"Word: {word}, Next Words Count: {len(next_words)}")

# Print one summary per user group, each preceded by its heading.
for heading, chain in (
    ("Summary of Markov chain for elite users:", elite_markov_chain),
    ("\nSummary of Markov chain for non-elite users:", non_elite_markov_chain),
):
    print(heading)
    print_markov_chain_summary(chain)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Word: puh, Next Words Count: 1
Word: schwitzen, Next Words Count: 1
Word: kugel, Next Words Count: 1
Word: schlechter, Next Words Count: 1
Word: danke, Next Words Count: 1
Word: komm, Next Words Count: 2
Word: geehrten, Next Words Count: 1
Word: exklusiv, Next Words Count: 1
Word: brezel, Next Words Count: 2
Word: präsentieren, Next Words Count: 1
Word: gewinner, Next Words Count: 2
Word: jury, Next Words Count: 1
Word: einstimmig, Next Words Count: 1
Word: vollendete, Next Words Count: 1
Word: handwerkskunst, Next Words Count: 1
Word: natürlichste, Next Words Count: 1
Word: anmutung, Next Words Count: 1
Word: haptik, Next Words Count: 1
Word: franckschen, Next Words Count: 1
Word: verführen, Next Words Count: 1
Word: schwärmen, Next Words Count: 1
Word: verneigen, Next Words Count: 1
Word: überreichen, Next Words Count: 1
Word: titel, Next Words Count: 1
Word: voller, Next Words Count: 1
Word: verdienten, Next Words Coun

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import StringType
import random

# Initialize SparkSession
#spark = SparkSession.builder \
#    .appName("Sentence Generator") \
#    .getOrCreate()

# Function to generate a sentence
def generate_sentence(corpus_dict, num_words, rng=None):
    """Generate a sentence by randomly walking a Markov chain.

    The walk starts at a randomly chosen key of ``corpus_dict`` and repeatedly
    picks a random successor of the current word. It stops when the sentence
    has ``num_words`` words, when the current word has no recorded successors,
    or when the chosen successor is ".".

    Args:
        corpus_dict: Mapping of word -> list of words that may follow it.
        num_words: Maximum number of words in the sentence.
        rng: Optional object with a ``choice`` method (for example
            ``random.Random(seed)``) for reproducible output. Defaults to the
            ``random`` module.

    Returns:
        The generated words joined by single spaces, or an empty string when
        ``corpus_dict`` is empty or ``num_words`` is not positive.
    """
    # An empty chain has no start word (random.choice would raise IndexError),
    # and a non-positive length asks for no words at all.
    if not corpus_dict or num_words <= 0:
        return ""

    chooser = rng if rng is not None else random

    word = chooser.choice(list(corpus_dict))  # random start word
    sentence = [word]

    while len(sentence) < num_words:
        next_words = corpus_dict.get(word)
        # Dead end: the word is not a key, or it has an empty successor list.
        if not next_words:
            break

        next_word = chooser.choice(next_words)
        # A "." successor ends the sentence early.
        if next_word == ".":
            break

        sentence.append(next_word)
        word = next_word

    return " ".join(sentence)

In [None]:
# REPEAT THIS CHUNK AS MANY TIMES AS YOU WANT FOR DIFFERENT EXAMPLES!

# Desired sentence length, wrapped in a one-row DataFrame so the UDFs below
# have a column to read it from.
num_words = 10 # Edit this number at will
num_words_df = spark.range(1).withColumn("num_words", lit(num_words))


def _elite_sentence(word_budget):
    """Generate a sentence from the elite-user Markov chain."""
    return generate_sentence(elite_markov_chain, word_budget)


def _nonelite_sentence(word_budget):
    """Generate a sentence from the non-elite-user Markov chain."""
    return generate_sentence(non_elite_markov_chain, word_budget)


# Expose both generators as PySpark UDFs that return strings.
generate_sentence_udf_elite = udf(_elite_sentence, StringType())
generate_sentence_udf_nonelite = udf(_nonelite_sentence, StringType())

# Elite: add a generated "sentence" column and print it untruncated.
sentence_df_elite = num_words_df.withColumn("sentence", generate_sentence_udf_elite("num_words"))
sentence_df_elite.show(truncate=False)

# Non-elite: same steps with the non-elite chain.
sentence_df_nonelite = num_words_df.withColumn("sentence", generate_sentence_udf_nonelite("num_words"))
sentence_df_nonelite.show(truncate=False)

# Stop SparkSession
#spark.stop()

+---+---------+----------------------------------------------------------------------+
|id |num_words|sentence                                                              |
+---+---------+----------------------------------------------------------------------+
|0  |10       |backflow valve torn apart seeing coupon estimated price chipotle sauce|
+---+---------+----------------------------------------------------------------------+

+---+---------+-------------------------------------------------------------+
|id |num_words|sentence                                                     |
+---+---------+-------------------------------------------------------------+
|0  |10       |wartende menge gespart wird nicht zu erreichen aber mir dafür|
+---+---------+-------------------------------------------------------------+

+---+---------+--------------------------------------------------------------------+
|id |num_words|sentence                                                            |
+--