In [1]:
import pandas as pd
import numpy as np

# 1️⃣ Sales Data Analysis

In [2]:
data = {
    'Date': pd.date_range('2020-01-01', periods=10, freq='ME'),
    'Product': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],
    'Region': ['North', 'South', 'East', 'West', 'North', 'South', 'East', 'West', 'North', 'South'],
    'Sales': np.random.randint(100, 500, 10),
    'Profit': np.random.randint(10, 50, 10)
}

sales = pd.DataFrame(data)

In [3]:
# Show the full sales table (10 rows) built in the previous cell.
sales

Unnamed: 0,Date,Product,Region,Sales,Profit
0,2020-01-31,A,North,109,49
1,2020-02-29,B,South,400,20
2,2020-03-31,A,East,151,39
3,2020-04-30,C,West,109,42
4,2020-05-31,B,North,103,20
5,2020-06-30,A,South,315,22
6,2020-07-31,C,East,133,38
7,2020-08-31,B,West,239,46
8,2020-09-30,A,North,122,21
9,2020-10-31,C,South,280,16


In [4]:
# Regional totals: add up Sales and Profit within each Region.
# The result is indexed by Region with one column per metric.
by_region = sales.groupby('Region')
by_region[['Sales', 'Profit']].sum()

Unnamed: 0_level_0,Sales,Profit
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
East,284,77
North,334,90
South,995,58
West,348,88


In [5]:
# Mean Sales value for each Product (Series indexed by Product).
sales_by_product = sales.groupby('Product')['Sales']
sales_by_product.mean()

Product
A    174.250000
B    247.333333
C    174.000000
Name: Sales, dtype: float64

In [6]:
# Employee performance dataset. (The numbered section headers in this notebook
# go from 1 to 3; this appears to be the un-headed section 2.)
#
# Records are written one employee per row for readability, then pivoted into
# the column-oriented dict that pd.DataFrame consumes.
_employee_fields = ['Employee', 'Department', 'Performance_Score', 'Salary']
_employee_rows = [
    ('Alice',   'HR',      88, 60000),
    ('Bob',     'IT',      95, 70000),
    ('Charlie', 'HR',      78, 65000),
    ('David',   'Finance', 85, 75000),
    ('Eve',     'IT',      91, 72000),
    ('Frank',   'Finance', 78, 77000),
    ('Grace',   'HR',      90, 68000),
]
employee_data = {
    field: list(column)
    for field, column in zip(_employee_fields, zip(*_employee_rows))
}

employees = pd.DataFrame(employee_data)

In [7]:
# Show the full employees table (7 rows) built in the previous cell.
employees

Unnamed: 0,Employee,Department,Performance_Score,Salary
0,Alice,HR,88,60000
1,Bob,IT,95,70000
2,Charlie,HR,78,65000
3,David,Finance,85,75000
4,Eve,IT,91,72000
5,Frank,Finance,78,77000
6,Grace,HR,90,68000


In [8]:
# Department mean Performance_Score, broadcast back onto every employee row.
# transform('mean') keeps the original row index (one value per employee, not
# one per department), so the result lines up element-wise with `employees`
# and can be compared against each employee's own score in a later cell.
dept_scores = employees.groupby('Department')['Performance_Score']
dept_average = dept_scores.transform('mean')
dept_average

0    85.333333
1    93.000000
2    85.333333
3    81.500000
4    93.000000
5    81.500000
6    85.333333
Name: Performance_Score, dtype: float64

In [9]:
# Correlation coefficient between Performance_Score and Salary
# (Series.corr with its default method).
scores = employees['Performance_Score']
salaries = employees['Salary']
scores.corr(salaries)

np.float64(-0.16508256192660767)

In [10]:
# Names of employees whose score is strictly above their own department's mean.
# `dept_average` is the row-aligned Series computed in an earlier cell, so the
# comparison is element-wise per employee.
above_dept_mean = employees['Performance_Score'] > dept_average
employees.loc[above_dept_mean, 'Employee']

0    Alice
1      Bob
3    David
6    Grace
Name: Employee, dtype: object

# 3️⃣ E-commerce Customer Data

In [11]:
# E-commerce customer dataset: seven customers (C1..C7) with age, gender,
# total spend and region, stored column-wise and loaded into a DataFrame.
customer_data = dict(
    Customer_ID=[f'C{number}' for number in range(1, 8)],
    Age=[25, 32, 47, 50, 29, 39, 36],
    Gender=['M', 'F', 'F', 'M', 'M', 'F', 'M'],
    Total_Spend=[1200, 800, 1500, 600, 2000, 2500, 900],
    Region=['North', 'South', 'East', 'West', 'North', 'South', 'East'],
)

customer = pd.DataFrame(customer_data)

In [12]:
# Show the full customer table (7 rows) built in the previous cell.
customer

Unnamed: 0,Customer_ID,Age,Gender,Total_Spend,Region
0,C1,25,M,1200,North
1,C2,32,F,800,South
2,C3,47,F,1500,East
3,C4,50,M,600,West
4,C5,29,M,2000,North
5,C6,39,F,2500,South
6,C7,36,M,900,East


In [13]:
# Keep only customers whose Total_Spend is strictly greater than $1000.
spent_over_1000 = customer['Total_Spend'].gt(1000)
customer.loc[spent_over_1000]

Unnamed: 0,Customer_ID,Age,Gender,Total_Spend,Region
0,C1,25,M,1200,North
2,C3,47,F,1500,East
4,C5,29,M,2000,North
5,C6,39,F,2500,South


In [14]:
# Mean Total_Spend within each Region (Series indexed by Region).
spend_by_region = customer.groupby('Region')['Total_Spend']
regional_average = spend_by_region.mean()
regional_average

Region
East     1200.0
North    1600.0
South    1650.0
West      600.0
Name: Total_Spend, dtype: float64

In [15]:
# Highest-spending customer: find the maximum Total_Spend, then select every
# row that matches it (so ties, if any, are all returned).
highest_spended = customer['Total_Spend'].max()
is_top_spender = customer['Total_Spend'].eq(highest_spended)
highest_spender = customer.loc[is_top_spender]
highest_spender

Unnamed: 0,Customer_ID,Age,Gender,Total_Spend,Region
5,C6,39,F,2500,South


In [16]:
# Lowest-spending customer: find the minimum Total_Spend, then select every
# row that matches it (so ties, if any, are all returned).
least_spended = customer['Total_Spend'].min()
is_bottom_spender = customer['Total_Spend'].eq(least_spended)
least_spender = customer.loc[is_bottom_spender]
least_spender

Unnamed: 0,Customer_ID,Age,Gender,Total_Spend,Region
3,C4,50,M,600,West


# 4️⃣ Movie Data Analysis

In [17]:
# Movie dataset: five titles ('Movie A' .. 'Movie E') with genre, release
# year, rating and revenue, stored column-wise and loaded into a DataFrame.
movie_data = {
    'Movie_Title': [f'Movie {letter}' for letter in 'ABCDE'],
    'Genre': ['Action', 'Comedy', 'Action', 'Drama', 'Comedy'],
    'Release_Year': [2019, 2020, 2021, 2020, 2021],
    'Rating': [7.5, 8.2, 6.9, 7.0, 8.0],
    'Revenue': [500_000, 1_200_000, 800_000, 300_000, 1_500_000],
}

movies = pd.DataFrame.from_dict(movie_data)

In [18]:
# Show the full movies table (5 rows) built in the previous cell.
movies

Unnamed: 0,Movie_Title,Genre,Release_Year,Rating,Revenue
0,Movie A,Action,2019,7.5,500000
1,Movie B,Comedy,2020,8.2,1200000
2,Movie C,Action,2021,6.9,800000
3,Movie D,Drama,2020,7.0,300000
4,Movie E,Comedy,2021,8.0,1500000


In [19]:
# Mean Rating for each Genre (Series indexed by Genre).
ratings_by_genre = movies.groupby('Genre')['Rating']
genre_rating = ratings_by_genre.mean()
genre_rating

Genre
Action    7.2
Comedy    8.1
Drama     7.0
Name: Rating, dtype: float64

In [20]:
# Highest-grossing movie: select the row(s) whose Revenue equals the maximum
# Revenue in the table (ties, if any, are all returned).
top_revenue = movies['Revenue'].max()
is_top_grossing = movies['Revenue'].eq(top_revenue)
highest_revenue = movies.loc[is_top_grossing]
highest_revenue

Unnamed: 0,Movie_Title,Genre,Release_Year,Rating,Revenue
4,Movie E,Comedy,2021,8.0,1500000


# 5️⃣ US Stock Market Data

In [21]:
# Stock dataset: five consecutive daily rows starting 2021-01-01, each with a
# company name, opening/closing price, volume and market cap.
stock_data = dict(
    Date=pd.date_range(start='2021-01-01', periods=5, freq='D'),
    Company=['Apple', 'Apple', 'Google', 'Google', 'Apple'],
    Opening_Price=[135, 138, 1500, 1520, 140],
    Closing_Price=[140, 142, 1520, 1540, 145],
    Volume=[100_000, 120_000, 130_000, 140_000, 110_000],
    Market_Cap=[
        2_300_000_000_000,
        2_350_000_000_000,
        1_600_000_000_000,
        1_650_000_000_000,
        2_330_000_000_000,
    ],
)

stocks = pd.DataFrame(stock_data)

In [22]:
# Show the full stocks table (5 rows) built in the previous cell.
stocks

Unnamed: 0,Date,Company,Opening_Price,Closing_Price,Volume,Market_Cap
0,2021-01-01,Apple,135,140,100000,2300000000000
1,2021-01-02,Apple,138,142,120000,2350000000000
2,2021-01-03,Google,1500,1520,130000,1600000000000
3,2021-01-04,Google,1520,1540,140000,1650000000000
4,2021-01-05,Apple,140,145,110000,2330000000000


In [23]:
# Calculate the daily percentage change in stock price.
#
# "Price" is taken to be Closing_Price, and the change is measured against the
# same company's previous observation, so Apple and Google prices are never
# compared with each other. Rows are sorted by Company and Date first so that
# "previous" means the prior trading row for that company.
# Caveats:
#   * the first row of each company has no predecessor and is NaN;
#   * a company's rows need not fall on consecutive calendar days, so a
#     "daily" change can span a gap in dates.
# `stocks` itself is left unmodified; the result is shown as a new column on a
# sorted copy.
stocks_sorted = stocks.sort_values(['Company', 'Date'])
daily_pct_change = (
    stocks_sorted.groupby('Company')['Closing_Price'].pct_change().mul(100)
)
stocks_sorted.assign(Daily_Pct_Change=daily_pct_change)


# 6️⃣ World Population Analysis

In [24]:
# World population dataset: five countries with continent, population and GDP.
# Records are written one country per row for readability, then pivoted into
# the column-oriented dict that pd.DataFrame consumes.
_population_fields = ['Country', 'Continent', 'Population', 'GDP']
_population_rows = [
    ('China',     'Asia',          1_393_409_038, 14_342_903),
    ('India',     'Asia',          1_366_417_754,  2_713_082),
    ('USA',       'North America',   331_002_651, 21_433_225),
    ('Indonesia', 'Asia',            273_523_615,  1_056_360),
    ('Brazil',    'South America',   212_559_417,  2_055_505),
]
population_data = {
    field: list(column)
    for field, column in zip(_population_fields, zip(*_population_rows))
}

population = pd.DataFrame(population_data)

In [25]:
# Show the full population table (5 rows) built in the previous cell.
population

Unnamed: 0,Country,Continent,Population,GDP
0,China,Asia,1393409038,14342903
1,India,Asia,1366417754,2713082
2,USA,North America,331002651,21433225
3,Indonesia,Asia,273523615,1056360
4,Brazil,South America,212559417,2055505


In [26]:
# Total Population for each Continent (Series indexed by Continent).
population_by_continent = population.groupby('Continent')['Population']
continent_population = population_by_continent.sum()
continent_population

Continent
Asia             3033350407
North America     331002651
South America     212559417
Name: Population, dtype: int64