In [None]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session: getOrCreate() returns an already
# running session if there is one, otherwise it starts a new one under the
# application name given here.
spark = (
    SparkSession.builder
    .appName("Dataframe Operations")
    .getOrCreate()
)

In [None]:
import datetime
from datetime import date
from functools import reduce

from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import avg, col, count, first, lit, lower, month, regexp_replace, row_number, rtrim, trim, to_timestamp, year, when
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DoubleType
from pyspark.sql.window import Window

In [None]:
# Explicit schema for the employee DataFrame. All fields are declared
# nullable; the sample rows below contain None for emp_name, manager_id,
# commission and dept_id.
employee_schema = StructType([
    StructField("emp_id", IntegerType(), True),
    StructField("emp_name", StringType(), True),
    StructField("job_name", StringType(), True),
    StructField("manager_id", IntegerType(), True),
    StructField("hire_date", DateType(), True),
    StructField("salary", DoubleType(), True),
    StructField("commission", DoubleType(), True),
    StructField("dept_id", IntegerType(), True)
])

# Sample employee rows, in the same field order as employee_schema.
# hire_date values are built with `date` (imported directly from the datetime
# module) rather than `datetime.date`: another cell in this notebook runs
# `from datetime import datetime`, which rebinds the name `datetime` to the
# class and would make `datetime.date(2020, 1, 1)` raise a TypeError if this
# cell were re-run afterwards.
employee_data = [
    (1, "Rohit", "Data Engineer", None, date(2020, 1, 1), 60000.0, None, 1),
    (2, "Deep", "Manager", 1, date(2020, 2, 15), 45000.0, None, 1),
    (3, "Ravi", "Analyst", 1, date(2020, 3, 10), 55000.0, None, 1),
    (4, "Deepak", "Developer", 1, date(2020, 1, 20), 48000.0, None, 2),
    (5, None, "Developer", 1, date(2021, 2, 5), 52000.0, None, 2),
    (6, "Prakash", "Manager", None, date(2021, 4, 1), 65000.0, None, None),
    (7, "Olivia", "Data Engineer", 6, date(2021, 6, 15), 58000.0, None, 3),
    (8, "James", "Data Engineer", 6, date(2022, 3, 25), 47000.0, None, 3)
]

employee_df = spark.createDataFrame(data=employee_data, schema=employee_schema)
employee_df.show(truncate=False)

# Department rows; the last one has no dept_id. Only column names are passed
# as the schema here, so Spark infers the column types from the data.
department = [(1, "IT", "New York"), (2, "HR", "San Francisco"), (3, "Marketing", "Los Angeles"), (None, "Sales", "Chicago")]
department_columns = ["dept_id", "dept_name", "dept_location"]
department_df = spark.createDataFrame(data=department, schema=department_columns)
department_df.show(truncate=False)


In [5]:
from datetime import datetime
import json
import pandas as pd
from tqdm import tqdm

In [4]:
# file_path = "C:\\Users\\rohitpandey02\\OneDrive - Nagarro\\Desktop\\toys_and_games.json"
file_path = "C:\\Users\\rohitpandey02\\OneDrive - Nagarro\\Desktop\\Toys_and_Games_5.json"

# Read the file line by line, parsing each line as its own JSON document, then
# build the pandas DataFrame from the collected records in a single call.
data = []
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding (often cp1252 on Windows), which can fail on or
# garble non-ASCII text in a UTF-8 JSON file.
with open(file_path, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        # Skip blank lines (e.g. a trailing empty line); json.loads would
        # raise on them.
        if line.strip():
            data.append(json.loads(line))
df = pd.DataFrame(data)

1828971it [00:39, 46053.69it/s]


In [7]:
# Show the dtype pandas assigned to each column of the loaded records.
df.dtypes

overall           float64
vote               object
verified             bool
reviewTime         object
reviewerID         object
asin               object
style              object
reviewerName       object
reviewText         object
summary            object
unixReviewTime      int64
image              object
dtype: object

In [8]:
# Number of rows in the full DataFrame.
len(df)

1828971

In [47]:
# Cast reviewTime to str (overwriting the column in df) so the .str accessor
# below can be applied to every value.
df['reviewTime'] = df['reviewTime'].astype(str)

# Boolean mask: True for rows whose reviewTime text contains any of the
# substrings '2005' through '2010' (the pattern is a regex alternation).
# NOTE(review): this is a plain substring match anywhere in the value, not a
# parsed-year comparison - presumably those digits only occur as the year;
# confirm against the reviewTime format.
in_year_range = df['reviewTime'].str.contains('2005|2006|2007|2008|2009|2010')
filtered_df = df.loc[in_year_range]


In [48]:
# Number of rows kept by the reviewTime filter.
len(filtered_df)

36238

In [49]:
# Destination file for the filtered reviews.
json_file = "C:\\office\\rohit-workspace\\assignments\\PySpark\\temp_toys_games.json"

# Export as JSON Lines (orient='records' with lines=True writes one JSON
# object per row, one per line). The target is json_file, so the output
# location is the path named above and does not depend on the kernel's
# current working directory.
filtered_df.to_json(json_file, orient='records', lines=True)
