In [2]:
# Step 1: Install PySpark
!pip install pyspark

# Step 2: Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, mean, min as spark_min, max as spark_max

# Step 3: Start Spark Session
# Configure the builder first, then obtain the session from it; the
# application name is what identifies this job in the Spark UI.
session_builder = SparkSession.builder.appName("Big Data Analysis with PySpark on Colab")
spark = session_builder.getOrCreate()

# Step 4: Upload the CSV
from google.colab import files
uploaded = files.upload()

# Step 5: Load CSV into Spark DataFrame
# files.upload() returns a {filename: file contents} mapping. Colab does not
# overwrite an existing file on re-upload -- it saves the new copy under a
# suffixed name (e.g. "Life Expectancy Data (1).csv") -- so reading a
# hard-coded filename can silently load a stale, earlier upload. Writing the
# freshly uploaded bytes to a fixed path guarantees Spark reads exactly what
# was just uploaded, whatever the file was called.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")

# Only the first uploaded file is used; any additional files are ignored.
_uploaded_name, csv_bytes = next(iter(uploaded.items()))
csv_path = "uploaded_dataset.csv"
with open(csv_path, "wb") as csv_file:
    csv_file.write(csv_bytes)

# header=True takes column names from the first row; inferSchema=True makes
# Spark scan the data to pick column types instead of reading all as strings.
df = spark.read.csv(csv_path, header=True, inferSchema=True)


-

# Step 6: Basic Exploration
# Show the inferred column types, then a small sample of the raw rows.
print("Schema:")
df.printSchema()

print("First 5 rows:")
df.show(n=5)

# Step 7: Drop Null Values
# Keep only rows in which every column is populated.
df_clean = df.na.drop()

# Step 8: Summary Statistics
print("Summary Statistics:")
summary_stats = df_clean.describe()
summary_stats.show()

# Step 9: Value Counts (if categorical column exists)
# Only runs when the dataset actually has a column named 'Category'.
if 'Category' in df_clean.columns:
    print("Top values in 'Category':")
    category_counts = df_clean.groupBy("Category").count()
    # Most frequent categories first.
    category_counts.orderBy(col("count").desc()).show()

# Step 10: Aggregations on Numerical Columns
# Imported here so this step carries its own dependency.
from pyspark.sql.types import NumericType

# Pick numeric columns by type, not by comparing str(dataType) with bare class
# names: current PySpark renders types as "IntegerType()" (with parentheses),
# so the string comparison matches nothing and no statistics would be printed.
# NumericType is the common base of the integral, floating-point and decimal
# types, so long/float/decimal columns are included as well.
numerical_cols = [
    field.name
    for field in df_clean.schema.fields
    if isinstance(field.dataType, NumericType)
]

if not numerical_cols:
    # Make the "nothing to report" case visible instead of printing nothing.
    print("No numerical columns found; skipping per-column statistics.")

# One small result table per column: non-null count, mean, minimum, maximum.
for col_name in numerical_cols:
    print(f"Stats for '{col_name}':")
    df_clean.select(
        count(col(col_name)).alias("count"),
        mean(col(col_name)).alias("mean"),
        spark_min(col(col_name)).alias("min"),
        spark_max(col(col_name)).alias("max")
    ).show()




Saving Life Expectancy Data.csv to Life Expectancy Data (1).csv
Schema:
root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life expectancy : double (nullable = true)
 |-- Adult Mortality: integer (nullable = true)
 |-- infant deaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage expenditure: double (nullable = true)
 |-- Hepatitis B: integer (nullable = true)
 |-- Measles : integer (nullable = true)
 |--  BMI : double (nullable = true)
 |-- under-five deaths : integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total expenditure: double (nullable = true)
 |-- Diphtheria : integer (nullable = true)
 |--  HIV/AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |--  thinness  1-19 years: double (nullable = true)
 |--  thinness 5-9 years: double (nullable = true)
 |-- Income composition of resources: double (nu