In [None]:
from google.colab import drive

In [None]:
import os
import zipfile
from zipfile import ZipFile
from os import path
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, to_timestamp, unix_timestamp,mean, to_date

In [None]:
# Mount Google Drive so the notebook can read and write under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check: list the input archives available in the Drive data folder.
!ls "/content/drive/MyDrive/data"

Divvy_Trips_2019_Q4.zip  Divvy_Trips_2020_Q1.zip


In [None]:
def get_file_paths(root_directory, folder_name, file_extension):
    """Recursively collect files under a folder that end with a given extension.

    Args:
        root_directory: Base directory that ``folder_name`` is joined onto.
        folder_name: Folder to search, relative to ``root_directory``. An
            absolute path is also accepted, in which case ``os.path.join``
            ignores ``root_directory``.
        file_extension: Extension to match, with or without the leading dot
            (``"zip"`` and ``".zip"`` are equivalent). An empty string
            matches every file.

    Returns:
        Sorted list of full paths to the matching files. The list is empty
        when nothing matches or the folder cannot be walked.
    """
    # Normalise to a dotted suffix so "zip" matches "a.zip" but not names that
    # merely contain those letters (e.g. "zipper.txt" or "a.zip.bak").
    if file_extension and not file_extension.startswith("."):
        suffix = f".{file_extension}"
    else:
        suffix = file_extension

    file_directory = os.path.join(root_directory, folder_name)
    file_paths = [
        os.path.join(root, file_name)
        for root, _, files in os.walk(file_directory)
        for file_name in files
        if file_name.endswith(suffix)
    ]
    # os.walk yields entries in arbitrary order; sort for reproducible runs.
    return sorted(file_paths)

In [None]:
def read_zip_files_into_dataframes(zip_file_paths):
    """Load every CSV found inside the given zip archives into one DataFrame.

    Members whose name does not end in ``.csv`` or whose name starts with
    ``__MACOSX`` are skipped.

    Args:
        zip_file_paths: Iterable of paths to zip archives.

    Returns:
        A pandas DataFrame with the rows of all CSV members concatenated and
        a fresh 0..n-1 index. Columns are the union of the members' columns.

    Raises:
        ValueError: If no CSV member is found in any of the archives.
    """
    df_list = []
    for file_path in zip_file_paths:
        print(f"Reading for {file_path}")
        # Context managers close the archive and each member handle even when
        # parsing fails, instead of leaking open file descriptors.
        with ZipFile(file_path) as zip_file:
            for member in zip_file.infolist():
                member_name = member.filename
                if not member_name.endswith(".csv") or member_name.startswith("__MACOSX"):
                    continue
                with zip_file.open(member) as csv_file:
                    df_list.append(pd.read_csv(csv_file))

    if not df_list:
        # pd.concat([]) raises an opaque "No objects to concatenate"; keep the
        # same exception type but say what actually went wrong.
        raise ValueError("No CSV files found in the provided zip archives")

    return pd.concat(df_list, ignore_index=True)

In [None]:
def write_to_csv(df, file_path):
    """Write ``df`` as CSV with a header row, replacing any existing output.

    Args:
        df: Object exposing a Spark-style ``write`` interface.
        file_path: Destination path handed to the CSV writer.
    """
    # "overwrite" mode replaces whatever is already at the destination.
    overwriting_writer = df.write.mode("overwrite")
    overwriting_writer.csv(file_path, header=True)

In [None]:
# Remember the notebook's working directory.
current_directory = os.getcwd()

In [None]:
# Display the working directory captured above.
current_directory

'/content'

In [None]:
# Find the zip archives in the Drive data folder. The base directory and the
# folder name are passed separately: an absolute folder_name would make
# os.path.join silently discard the root_directory argument.
zip_file_paths = get_file_paths("/content/drive/MyDrive", "data", "zip")

In [None]:
# Display the archives that were found.
zip_file_paths

['/content/drive/MyDrive/data/Divvy_Trips_2019_Q4.zip',
 '/content/drive/MyDrive/data/Divvy_Trips_2020_Q1.zip']

In [None]:
# Get or create the Spark session (app name "Exercise6", Hive support enabled).
spark = SparkSession.builder.appName("Exercise6").enableHiveSupport().getOrCreate()

In [None]:
# Read every CSV inside the archives into a single pandas DataFrame.
pdf = read_zip_files_into_dataframes(zip_file_paths)

Reading for /content/drive/MyDrive/data/Divvy_Trips_2019_Q4.zip
Reading for /content/drive/MyDrive/data/Divvy_Trips_2020_Q1.zip


In [None]:
# Convert the pandas DataFrame into a Spark DataFrame for the aggregation below.
# NOTE(review): no explicit schema is passed, so column types are left to Spark — confirm they are as expected.
sparkDF = spark.createDataFrame(pdf)

In [None]:
# Preview the first rows of the combined data.
sparkDF.show()

+-----------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+-------+-------------+----------+--------+------------------+----------------+----------------+--------------+---------+---------+-------+-------+-------------+
|    trip_id|         start_time|           end_time|bikeid|tripduration|from_station_id|   from_station_name|to_station_id|     to_station_name|  usertype|gender|birthyear|ride_id|rideable_type|started_at|ended_at|start_station_name|start_station_id|end_station_name|end_station_id|start_lat|start_lng|end_lat|end_lng|member_casual|
+-----------+-------------------+-------------------+------+------------+---------------+--------------------+-------------+--------------------+----------+------+---------+-------+-------------+----------+--------+------------------+----------------+----------------+--------------+---------+---------+-------+-------+-------------

In [34]:
def output_average_trip_duration(df, output_file_path):
    """Write the mean trip duration per start day to CSV.

    Each trip's duration is ``end_time - start_time`` in seconds, with both
    columns converted through ``unix_timestamp``. Trips are grouped by the
    date of ``start_time`` and averaged; the result has the columns ``day``
    and ``trip_duration``, ordered by ``day``.

    Args:
        df: Spark DataFrame with ``start_time`` and ``end_time`` columns.
        output_file_path: Destination passed on to ``write_to_csv``.
    """
    start_seconds = unix_timestamp('start_time').cast('long')
    end_seconds = unix_timestamp('end_time').cast('long')

    # Keep only what the aggregation needs: the start day and the per-trip
    # duration in seconds.
    per_trip = df.select(
        to_date(col("start_time")).alias("day"),
        (end_seconds - start_seconds).alias("duration_seconds"),
    )

    # dropna() removes groups whose day or average is null.
    # NOTE(review): rows with no start_time land in the null-day group and are
    # dropped here; confirm that is intended for inputs that carry their
    # timestamps in other columns (e.g. started_at / ended_at).
    daily_average = (
        per_trip
        .groupBy("day")
        .agg(mean("duration_seconds").alias("trip_duration"))
        .orderBy("day")
        .dropna()
    )
    write_to_csv(daily_average, output_file_path)


In [35]:
# Generate report 1: average trip duration per day, written under the Drive reports folder.
output_average_trip_duration(sparkDF, "/content/drive/MyDrive/reports/report1")