## File Combination ##

In [13]:
#preparation
import findspark
findspark.init("/usr/local/spark/")

from pyspark.sql import SparkSession
spark = SparkSession.builder \
   .master("local") \
   .appName("experiment") \
   .config("spark.executor.memory", "1gb") \
   .getOrCreate()

In [14]:
#building the schema for our pyspark dataframe
import pyspark.sql.functions as F
from pyspark.sql import SQLContext
from pyspark.sql.types import *
sqlContext = SQLContext(spark)

customSchema = StructType([ \
StructField("text", StringType(), True), \
StructField("date", StringType(), True), \
StructField("location", StringType(), True), \
StructField("polarity", StringType(), True), \
StructField("subjectivity", StringType(), True), \
StructField("classification", StringType(), True)])

In [15]:
#combining all csv-files of the Johnson-World-tweets into one pyspark dataframe
try:
    import os
    os.chdir("/home/jovyan")
    fullpath_jnj = './combined_johnson&johnson_worldwide/*.csv'

    df_jj = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath_jnj) \
    
    df_jj.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------+--------+------------+--------------+
|                text|                date|      location|polarity|subjectivity|classification|
+--------------------+--------------------+--------------+--------+------------+--------------+
|💉 Community phar...|Fri Jul 02 08:04:...|Kerry, Ireland|     0.0|         0.0|       Neutral|
|⚡️ Pharmacies can...|                null|          null|    null|        null|          null|
| on Morning Ireland"|Fri Jul 02 08:04:...|       Ireland|     0.0|         0.0|       Neutral|
|💉 And then from ...|                null|          null|    null|        null|          null|
|18-34 year olds c...|Fri Jul 02 08:05:...|       Ireland|     0.0|         0.0|       Neutral|
+--------------------+--------------------+--------------+--------+------------+--------------+
only showing top 5 rows



In [16]:
#combining all csv-files of the Moderna-World-tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath_moderna = './combined_moderna_worldwide/*.csv'

    df_moderna = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath_moderna) \

    df_moderna.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------------+--------+------------+--------------+
|                text|                date|            location|polarity|subjectivity|classification|
+--------------------+--------------------+--------------------+--------+------------+--------------+
|The Pfizer-BioNTe...|Mon Jun 28 14:43:...|Johannesburg, Sou...|     0.5|         0.5|      Positive|
|   🤞🏾this is true.|                null|                null|    null|        null|          null|
|Pfizer and Modern...|Mon Jun 28 14:43:...|       United States|    0.35|        0.65|      Positive|
|The Pfizer-BioNTe...|Mon Jun 28 14:43:...|              Mexico|     0.5|         0.5|      Positive|
|Great news — Pfiz...|Mon Jun 28 14:43:...|               World|     0.8|        0.75|      Positive|
+--------------------+--------------------+--------------------+--------+------------+--------------+
only showing top 5 rows



In [17]:
#combining all csv-files of the Pfizer-World-tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath_pfizer = './combined_pfizer_worldwide/*.csv'

    df_pfizer = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath_pfizer) \

    df_pfizer.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------------+--------------------+-------------------+--------------+
|                text|                date|            location|            polarity|       subjectivity|classification|
+--------------------+--------------------+--------------------+--------------------+-------------------+--------------+
|Pfizer and Modern...|Tue Jun 29 08:14:...|            Houilles| 0.13636363636363635|0.45454545454545453|      Positive|
|About half of adu...|Tue Jun 29 08:15:...|      Greenville, SC|-0.16666666666666666|0.16666666666666666|      Negative|
|I booked in two P...|Tue Jun 29 08:15:...|Bathtub full of piss|                 0.0|                0.0|       Neutral|
|Rollout of the se...|Tue Jun 29 08:15:...|Pretoria, South A...|           -0.078125|           0.421875|      Negative|
|                   .|                null|                null|                null|               null|          null|
+--------------------+----------

In [18]:
#combining all csv-files of the AstraZeneca-World-tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath_AstraZeneca = './combined_astra_worldwide/*.csv'

    df_astraz = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath_AstraZeneca) \

    df_astraz.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------------+--------------------+-------------------+--------------+
|                text|                date|            location|            polarity|       subjectivity|classification|
+--------------------+--------------------+--------------------+--------------------+-------------------+--------------+
|Australia is the ...|                null|                null|                null|               null|          null|
|vaccinations Astr...|Mon Jun 28 10:01:...|Terrigal, Central...|                 0.0|                0.0|       Neutral|
|Less than a quart...|                null|                null|                null|               null|          null|
|Disgraceful negle...|Mon Jun 28 10:01:...| Melbourne, Victoria|-0.16666666666666666|0.06666666666666667|      Negative|
|Breaking National...|Mon Jun 28 10:02:...|Melbourne / Kuala...|                 0.0|                0.0|       Neutral|
+--------------------+----------

In [19]:
try:
    os.chdir("/home/jovyan/dp2-2021s-teamboehm/Test/TestCombine")
    df_jj.toPandas().to_csv('johnson_combined.csv', index = False)
    df_moderna.toPandas().to_csv('moderna_combined.csv', index = False)
    df_pfizer.toPandas().to_csv('pfizer_combined.csv', index = False)
    df_astraz.toPandas().to_csv('astraz_combined.csv', index = False)
except: 
    print("Unexpected error:", sys.exc_info()[0])

In [20]:
#combining all csv-files of the pfizer-UK-tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath = './combined_pfizer_UK/*.csv'

    pfizer_UK_df = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath) \
       #.withColumn("filename", input_file_name())

    pfizer_UK_df.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------------+------------------+------------------+--------------+
|                text|                date|            location|          polarity|      subjectivity|classification|
+--------------------+--------------------+--------------------+------------------+------------------+--------------+
|My colleague and ...|Fri Jul 02 09:00:...|Scotland, United ...|           -0.0625|            0.1875|      Negative|
|The number of tho...|Fri Jul 02 09:01:...|                  UK|             -0.05|               0.7|      Negative|
|Is now a fully va...|                null|                null|              null|              null|          null|
|fridaymorning Pfi...|Fri Jul 02 09:01:...|          Manchester|0.7083333333333334|0.7416666666666667|      Positive|
|Do I cop the Irn ...|Fri Jul 02 09:01:...|   Glasgow, Scotland|               0.0|               0.0|       Neutral|
+--------------------+--------------------+-------------

In [21]:
#combining all csv-files of the AstraZeneca-UK-tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath = './combined_astra_UK/*.csv'

    astra_UK_df = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath) \
       #.withColumn("filename", input_file_name())

    astra_UK_df.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------------+-------------------+------------------+--------------+
|                text|                date|            location|           polarity|      subjectivity|classification|
+--------------------+--------------------+--------------------+-------------------+------------------+--------------+
|Boost for the COV...|Mon Jun 28 11:03:...|     London, England|0.19318181818181818|0.3939393939393939|      Positive|
|Research by the U...|Mon Jun 28 11:03:...|          London, UK|                0.1|               0.3|      Positive|
|Research by the U...|Mon Jun 28 11:03:...|Middlesbrough, En...|               0.05|              0.15|      Positive|
|Research by the U...|Mon Jun 28 11:03:...|  Sheffield, England|               0.05|              0.15|      Positive|
|Research by the U...|Mon Jun 28 11:03:...| Edinburgh, Scotland|               0.05|              0.15|      Positive|
+--------------------+--------------------+-----

In [22]:
#combining all csv-files of the Pfizer-US-tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath = './combined_pfizer_US/*.csv'

    pfizer_US_df = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath) \
       #.withColumn("filename", input_file_name())

    pfizer_US_df.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+------------+--------+------------+--------------+
|                text|                date|    location|polarity|subjectivity|classification|
+--------------------+--------------------+------------+--------+------------+--------------+
|Vaccine appointme...|                null|        null|    null|        null|          null|
|Walgreens/Duane R...|                null|        null|    null|        null|          null|
|122 East 42nd Street|           Manhattan|          NY|   10168|        null|          null|
|                (8…"|Fri Jul 02 10:17:...|New York, NY|     0.4|         0.4|      Positive|
|Vaccine appointme...|                null|        null|    null|        null|          null|
+--------------------+--------------------+------------+--------+------------+--------------+
only showing top 5 rows



In [23]:
#combining all csv-files of the AstraZeneca-US-tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath = './combined_astra_US/*.csv'

    astra_US_df = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath) \
       #.withColumn("filename", input_file_name())

    astra_US_df.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|                text|                date|            location|            polarity|      subjectivity|    classification|
+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|Here we go again....|Mon Jun 28 10:55:...|          Boulder CO|0.041666666666666685|0.3333333333333333|          Positive|
|The virus is here...|                null|                null|                null|              null|              null|
|            Recently| 's press release...|Mon Jun 28 10:57:...|          New Jersey|0.0909090909090909|0.4166666666666667|
|Here we go again....|Mon Jun 28 10:57:...|         Atlanta, GA|0.041666666666666685|0.3333333333333333|          Positive|
|6) wouldn’t it be...|Mon Jun 28 10:59:...|       United States|                 0.7|               0.9|          Positive|
+-------

In [24]:
try:  
    os.chdir("/home/jovyan/dp2-2021s-teamboehm/Test/TestCombine")
    pfizer_UK_df.toPandas().to_csv('pfizer_UK_combined.csv', index = False)
    astra_UK_df.toPandas().to_csv('astra_UK_combined.csv', index = False)
    pfizer_US_df.toPandas().to_csv('pfizer_US_combined.csv', index = False)
    astra_US_df.toPandas().to_csv('astra_US_combined.csv', index = False)
except: 
    print("Unexpected error:", sys.exc_info()[0])

In [27]:
#combining all csv-files of all worldwide tweets into one pyspark dataframe
try:
    os.chdir("/home/jovyan")
    fullpath = './combined_all_worldwide/*.csv'

    # read all csv files in the folder into a single pyspark dataframe
    worldwide_combined_df = spark.read.format("csv") \
       .option("header", "false") \
       .option("sep",",") \
       .schema(customSchema) \
       .load(fullpath)
    
    worldwide_combined_df.show(5)
except: 
    print("Unexpected error:", sys.exc_info()[0])

+--------------------+--------------------+--------------------+--------+------------+--------------+
|                text|                date|            location|polarity|subjectivity|classification|
+--------------------+--------------------+--------------------+--------+------------+--------------+
|The Pfizer-BioNTe...|Mon Jun 28 14:43:...|Johannesburg, Sou...|     0.5|         0.5|      Positive|
|   🤞🏾this is true.|                null|                null|    null|        null|          null|
|Pfizer and Modern...|Mon Jun 28 14:43:...|       United States|    0.35|        0.65|      Positive|
|The Pfizer-BioNTe...|Mon Jun 28 14:43:...|              Mexico|     0.5|         0.5|      Positive|
|Great news — Pfiz...|Mon Jun 28 14:43:...|               World|     0.8|        0.75|      Positive|
+--------------------+--------------------+--------------------+--------+------------+--------------+
only showing top 5 rows



In [28]:
try:  
    os.chdir("/home/jovyan/dp2-2021s-teamboehm/Test/TestCombine")
    worldwide_combined_df.toPandas().to_csv('worldwide_combined.csv', index = False)
except: 
    print("Unexpected error:", sys.exc_info()[0])