In [5]:
import pandas as pd
# Load the sales records from the CSV file in the working directory.
sales_data = pd.read_csv(filepath_or_buffer='sales_data.csv')
# Bare last expression: renders the first five rows as the cell output.
sales_data.head(n=5)

Unnamed: 0,Date,Product,Category,Quantity,Price
0,2023-01-01,Laptop,Electronics,10,800
1,2023-01-01,T-Shirt,Clothing,5,20
2,2023-01-02,Smartphone,Electronics,8,400
3,2023-01-02,Coffee Maker,Home,12,50
4,2023-01-03,Jeans,Clothing,15,30


In [6]:
# Total units sold per category; reset_index turns the group key back into
# an ordinary 'Category' column next to the summed 'Quantity'.
sales_data.groupby('Category')['Quantity'].sum().reset_index()

Unnamed: 0,Category,Quantity
0,Clothing,157
1,Electronics,183
2,Home,144


In [7]:
# Average unit price per category; as_index=False keeps 'Category' as a
# regular column so the result is a flat two-column frame.
sales_data.groupby('Category', as_index=False)['Price'].mean()

Unnamed: 0,Category,Price
0,Clothing,31.176471
1,Electronics,276.764706
2,Home,55.0


In [8]:
# Largest single-row quantity per category, returned as a flat frame with
# the columns 'Category' and 'Quantity'.
sales_data.groupby('Category', as_index=False).agg(Quantity=('Quantity', 'max'))

Unnamed: 0,Category,Quantity
0,Clothing,15
1,Electronics,15
2,Home,14


In [9]:
# Best-selling product in each category.
# Step 1: total quantity per (Category, Product) pair.
# Step 2: within each category, idxmax returns the row label of the largest
#         total (the first one on ties); .loc then picks exactly those rows,
#         which is why the original row labels survive in the output.
top_selling_p = (
    sales_data
    .groupby(['Category', 'Product'], as_index=False)['Quantity']
    .sum()
    .pipe(lambda totals: totals.loc[totals.groupby('Category')['Quantity'].idxmax()])
)
top_selling_p

Unnamed: 0,Category,Product,Quantity
8,Clothing,Jeans,15
27,Electronics,Smart TV,15
46,Home,Pressure Cooker,14


In [None]:
# Date with the highest total sales value.
# Row-level sales value (unit price times units sold) is stored on sales_data
# itself as a new column, so the frame is modified by this cell.
sales_data['Highest_total_sales'] = sales_data['Price'] * sales_data['Quantity']
# Sum the row values per date, then keep the single row (returned as a Series)
# whose daily total is the largest; idxmax picks the first one on ties.
date_highest_sales = (
    sales_data
    .groupby('Date', as_index=False)['Highest_total_sales']
    .sum()
    .pipe(lambda daily: daily.loc[daily['Highest_total_sales'].idxmax()])
)
date_highest_sales

Date                   2023-01-07
Highest_total_sales         15150
Name: 6, dtype: object

In [15]:
# Load the customer order records from the CSV file in the working directory.
cust_orders = pd.read_csv(filepath_or_buffer='customer_orders.csv')
# Bare last expression: renders the first five rows as the cell output.
cust_orders.head(n=5)

Unnamed: 0,OrderID,CustomerID,Product,Quantity,Price
0,1,101,Laptop,2,800
1,2,102,Headphones,1,150
2,3,103,Smartphone,3,400
3,4,101,External Hard Drive,2,80
4,5,102,Backpack,1,40


In [27]:
# Number of orders per customer, restricted to customers with at most 20 orders.
# After the count, the 'OrderID' column holds the order count, not an order id.
f_customers = (
    cust_orders
    .groupby('CustomerID', as_index=False)['OrderID']
    .count()
    .pipe(lambda order_counts: order_counts[order_counts['OrderID'] <= 20])
)
f_customers

Unnamed: 0,CustomerID,OrderID
2,103,20
3,104,20
4,105,18


In [28]:
# Mean price per product, keeping only products whose mean price exceeds 120.
avg_price = (
    cust_orders
    .groupby('Product', as_index=False)
    .agg({'Price': 'mean'})
    .pipe(lambda means: means[means['Price'] > 120])
)
avg_price

Unnamed: 0,Product,Price
16,Drone,300.0
21,Gaming Console,400.0
22,Headphones,150.0
26,Laptop,800.0
27,Point-and-Shoot Camera,250.0
28,Projector,200.0
29,Smart TV,1000.0
30,Smartphone,400.0
38,Tablet,300.0


In [31]:
# Per-product sums of 'Quantity' and 'Price', restricted to products with
# fewer than 5 units ordered in total.
total_q = (
    cust_orders
    .groupby('Product', as_index=False)[['Quantity', 'Price']]
    .sum()
    .pipe(lambda sums: sums[sums['Quantity'] < 5])
)
total_q

Unnamed: 0,Product,Quantity,Price
0,Backpack,4,160
1,Blender,4,180
2,Blu-ray Player,2,140
3,Bluetooth Headset,2,100
4,Bookshelf,2,160
6,Casual Shoes,3,90
7,Coffee Maker,3,150
8,Coffee Table,3,200
9,Comforter Set,3,135
10,Cookware Set,1,60


In [None]:
import sqlite3
import pandas as pd

# Salary-band analysis of the population table.
#
# Reads salaries and states from SQLite, assigns every salary to a band whose
# labels come from the Excel template, computes count / percentage / mean /
# median per band (overall and per state), and writes both tables to Excel.

# --- Load inputs -------------------------------------------------------------
conn = sqlite3.connect('population.db')
try:
    population_df = pd.read_sql('SELECT salary, state FROM population', conn)
finally:
    # Release the connection even when the query above raises.
    conn.close()

sal_bands_df = pd.read_excel('population_salary_analysis.xlsx')

# --- Assign salary bands -----------------------------------------------------
# Band edges; intervals are right-closed, and include_lowest=True makes the
# first band also contain its left edge (0).
bins = [0, 200000, 400000, 600000, 800000, 1000000, 1200000, 1400000, 1600000, 1800000, float('inf')]
# Labels are taken from the template in row order. pd.cut needs exactly one
# label per interval, i.e. len(bins) - 1 labels.
labels = sal_bands_df['Salary Band'].tolist()

# Categorize salaries
population_df['Salary Category'] = pd.cut(population_df['salary'],
                                          bins=bins,
                                          labels=labels,
                                          include_lowest=True)

# --- Overall measures per band -----------------------------------------------
# The denominator is the full row count, including rows whose salary did not
# fall into any band.
total_pop = len(population_df)
# observed=False is spelled out on purpose: every band stays in the result in
# label order even when it is empty, which the positional concat with the
# template further down relies on. It also avoids the pandas warning about the
# changing default for categorical groupers.
grouped = population_df.groupby('Salary Category', observed=False)

result_overall = pd.DataFrame({
    'Number of population': grouped['salary'].count(),
    'Percentage': (grouped['salary'].count() / total_pop * 100).round(2),
    'Average Salary': grouped['salary'].mean().round(2),
    'Median Salary': grouped['salary'].median().round(2)
})

# Reset index to match Excel structure
result_overall = result_overall.reset_index()

# --- Measures per state and band ---------------------------------------------
grouped_by_state = population_df.groupby(['state', 'Salary Category'], observed=False)
result_by_state = grouped_by_state['salary'].agg(
    Number_of_population='count',
    Average_Salary='mean',
    Median_Salary='median'
).reset_index()

# Calculate percentage within each state
state_totals = result_by_state.groupby('state')['Number_of_population'].transform('sum')
result_by_state['Percentage'] = (result_by_state['Number_of_population'] / state_totals * 100).round(2)

# result_by_state already has a flat index from the reset_index() above; a
# second reset_index() here would add a meaningless 'index' column to the
# printed table and to the exported workbook, so it is intentionally omitted.

# --- Build the updated template ----------------------------------------------
# Update the Excel file
# Drop the template's measure columns (empty/NaN placeholders according to the
# original notes) so they can be replaced by the calculated ones.
sal_bands_df = sal_bands_df.drop(columns=['Percentage', 'Average Salary', 'Median Salary', 'Number of population'])
# Merge with calculated results: rows are matched by position, which is valid
# because the band labels were taken from the template in this same order.
result_overall = result_overall.drop(columns=['Salary Category'])  # Drop category column for merge
final_df = pd.concat([sal_bands_df, result_overall], axis=1)

# Save to Excel
final_df.to_excel('taskpopulation_salary_analysis_updated.xlsx', index=False)
result_by_state.to_excel('taskpopulation_salary_analysis_by_state.xlsx', index=False)

# Print results
print("Overall Measures:")
print(final_df)
print("\nMeasures by State:")
print(result_by_state)

  grouped = population_df.groupby('Salary Category')
  grouped_by_state = population_df.groupby(['state', 'Salary Category'])


Overall Measures:
               Salary Band  Number of population  Percentage  Average Salary  \
0            till $200,000                  1151        9.88        99283.99   
1      $200,001 - $400,000                  1170       10.04       299558.07   
2      $400,001 - $600,000                  1234       10.59       499163.98   
3      $600,001 - $800,000                  1156        9.92       699680.87   
4    $800,001 - $1,000,000                  1175       10.08       901152.28   
5  $1,000,001 - $1,200,000                  1227       10.53      1098524.07   
6  $1,200,001 - $1,400,000                  1131        9.71      1300684.67   
7  $1,400,001 - $1,600,000                  1132        9.72      1499605.60   
8  $1,600,001 - $1,800,000                  1120        9.61      1698519.17   
9      $1,800,001 and over                  1155        9.91      1902891.84   

   Median Salary  
0        98800.0  
1       299882.0  
2       497925.5  
3       701317.0  
4     