In [None]:
import pandas as pd
import numpy as np
# import re
import warnings
warnings.filterwarnings('ignore')

def ascii_table_to_df(table_str: str) -> pd.DataFrame:
    """
    Convert an ASCII/SQL-style table string into a pandas DataFrame.
    Automatically infers numeric and datetime column types.
    """
    rows = []
    for line in table_str.splitlines():
        if "|" in line:
            # Extract all cells between pipes
            cells = [cell.strip() for cell in line.strip().split("|")[1:-1]]
            rows.append(cells)

    # First row is header, rest is data
    columns, data = rows[0], rows[1:]

    # Build DataFrame
    df = pd.DataFrame(data, columns=columns)

    # Try to infer data types
    for col in df.columns:
        # Convert to numeric if possible
        df[col] = pd.to_numeric(df[col], errors="ignore")
        # Convert to datetime if possible
        if df[col].dtype == "object":
            try:
                df[col] = pd.to_datetime(df[col], errors="raise")
            except Exception:
                pass

    return df

# dataset conversion 

In [60]:
# convert dict to dataframe
data_dict = {
  'Category': ['A', 'B', 'A', 'C', 'B'],
  'Value': [10, 15, 20, 25, 30]
}

df = pd.DataFrame(data_dict)
df

Unnamed: 0,Category,Value
0,A,10
1,B,15
2,A,20
3,C,25
4,B,30


In [61]:
# convert dataframe to dict 
df.to_dict(orient='list')

{'Category': ['A', 'B', 'A', 'C', 'B'], 'Value': [10, 15, 20, 25, 30]}

# some basics

# string methods

In [None]:
""" 1683
find the IDs of invalid tweets, where the content of the tweet is greater than 15:
"""
# dataframe.loc[index, columns]
# series.str.len()

df = ascii_table_to_df("""
+----------+-----------------------------------+
| tweet_id | content                           |
+----------+-----------------------------------+
| 1        | Let us Code                       |
| 2        | More than fifteen chars are here! |
+----------+-----------------------------------+
""")

def func(df: pd.DataFrame) -> pd.DataFrame:
  return df.loc[df['content'].str.len()>15, ['tweet_id']]
func(df)

Unnamed: 0,tweet_id
1,2


In [None]:
""" 1873
Write a solution to calculate the bonus of each employee. The bonus of an employee is 100% of their salary if the ID of the employee is an odd number and the employee's name does not start with the character 'M'. The bonus of an employee is 0 otherwise. Return the result table ordered by employee_id.
"""
# series.str.startswith(char)
# dataframe.sort_values(by=column, ascending=True)

df = ascii_table_to_df("""
+-------------+---------+--------+
| employee_id | name    | salary |
+-------------+---------+--------+
| 2           | Meir    | 3000   |
| 3           | Michael | 3800   |
| 7           | Addilyn | 7400   |
| 8           | Juan    | 6100   |
| 9           | Kannon  | 7700   |
+-------------+---------+--------+
""")

def calculate_special_bonus(df: pd.DataFrame) -> pd.DataFrame:
    # create a new column for bonus with default value 0
    df['bonus'] = 0

    # calculate bonus based on the conditions
    df.loc[(df['employee_id']%2 !=0) & (~df['name'].str.startswith('M')), 'bonus'] = df['salary']

    # select only the required columns and sort the result table by employee_id in ascending order
    result_df = df[['employee_id', 'bonus']].sort_values(by='employee_id', ascending=True)

    return result_df
calculate_special_bonus(df)


Unnamed: 0,employee_id,bonus
0,2,0
1,3,0
2,7,7400
3,8,0
4,9,7700


In [None]:
""" 1667
Write a solution to fix the names so that only the first character is uppercase and the rest are lowercase.
Return the result table ordered by user_id.
"""
# series.capitalize(): only make the first letter uppercase
# series.title(): make each word capitalized, e.g. "john doe" → "John Doe" 
# notice, keyword 'by' in sort_values() only works for dataframe; no 'by' keyword for series

df = ascii_table_to_df("""
+---------+-------+
| user_id | name  |
+---------+-------+
| 1       | aLice |
| 2       | bOB   |
+---------+-------+
""")
def func(df: pd.DataFrame) -> pd.DataFrame:
  df['name'] = df['name'].str.capitalize()
  return df.sort_values(by='user_id', ascending=True)
func(df)

Unnamed: 0,user_id,name
0,1,Alice
1,2,Bob


In [58]:
""" 1517
Write a solution to find the users who have valid emails.
A valid e-mail has a prefix name and a domain where:
    The prefix name is a string that may contain letters (upper or lower case), digits, underscore '_', period '.', and/or dash '-'. The prefix name must start with a letter.
    The domain is '@leetcode.com'.
Return the result table in any order.
"""
# regex filtering
# series.str.match(pattern, na=False)

df = ascii_table_to_df("""
+---------+-----------+-------------------------+
| user_id | name      | mail                    |
+---------+-----------+-------------------------+
| 1       | Winston   | winston@leetcode.com    |
| 2       | Jonathan  | jonathanisgreat         |
| 3       | Annabelle | bella-@leetcode.com     |
| 4       | Sally     | sally.come@leetcode.com |
| 5       | Marwan    | quarz#2020@leetcode.com |
| 6       | David     | david69@gmail.com       |
| 7       | Shapiro   | .shapo@leetcode.com     |
+---------+-----------+-------------------------+
""")
def func(df: pd.DataFrame) -> pd.DataFrame:
  pattern = r'^[A-Za-z][A-Za-z0-9_.-]*@leetcode\.com$'
  return df.loc[df['mail'].str.match(pattern, na=False)] 
func(df)

Unnamed: 0,user_id,name,mail
0,1,Winston,winston@leetcode.com
2,3,Annabelle,bella-@leetcode.com
3,4,Sally,sally.come@leetcode.com


In [59]:
""" 1527
Write a solution to find the patient_id, patient_name, and conditions of the patients who have Type I Diabetes. Type I Diabetes always starts with DIAB1 prefix.
Return the result table in any order.
"""


df = ascii_table_to_df("""
+------------+--------------+--------------+
| patient_id | patient_name | conditions   |
+------------+--------------+--------------+
| 1          | Daniel       | YFEV COUGH   |
| 2          | Alice        |              |
| 3          | Bob          | DIAB100 MYOP |
| 4          | George       | ACNE DIAB100 |
| 5          | Alain        | DIAB201      |
+------------+--------------+--------------+
""")
def func(df: pd.DataFrame) -> pd.DataFrame:
  return df
func(df)

Unnamed: 0,patient_id,patient_name,conditions
0,1,Daniel,YFEV COUGH
1,2,Alice,
2,3,Bob,DIAB100 MYOP
3,4,George,ACNE DIAB100
4,5,Alain,DIAB201


# data manipulation

In [None]:
""" 177
Write a solution to find the nth highest distinct salary from the Employee table. If there are less than n distinct salaries, return null.
"""


table = """
+----+--------+
| id | salary |
+----+--------+
| 1  | 100    |
| 2  | 200    |
| 3  | 300    |
+----+--------+
"""
df = ascii_table_to_df(table)
def func(df: pd.DataFrame) -> pd.DataFrame:
  return df
func(df)

Unnamed: 0,id,salary
0,1,100
1,2,200
2,3,300


In [56]:
""" 176
Write a solution to find the second highest distinct salary from the Employee table. If there is no second highest salary, return null (return None in Pandas).
"""


table = """
+----+--------+
| id | salary |
+----+--------+
| 1  | 100    |
| 2  | 200    |
| 3  | 300    |
+----+--------+
"""
df = ascii_table_to_df(table)
def func(df: pd.DataFrame) -> pd.DataFrame:
  return df
func(df)

Unnamed: 0,id,salary
0,1,100
1,2,200
2,3,300


# data aggregation


Pandas data aggregation involves summarizing or combining data within a DataFrame or Series, often after grouping it by one or more columns. This process reduces the dimensionality of the data, providing insights into trends and patterns.


Key Concepts and Methods:

    groupby(): This is the fundamental method for splitting a DataFrame into groups based on the unique values in one or more columns. It returns a DataFrameGroupBy object, which can then be aggregated.

Aggregation Functions:

After grouping, various aggregation functions can be applied to the grouped data to perform calculations like:

    sum(): Computes the sum of values in each group.
    mean(): Calculates the average of values in each group.
    median(): Determines the median of values in each group.
    min(): Finds the minimum value in each group.
    max(): Finds the maximum value in each group.
    count(): Counts the number of non-null values in each group.
    std(): Computes the standard deviation of values in each group.
    var(): Computes the variance of values in each group.
    size(): Computes the size (number of rows) of each group.
    first(), last(), nth(): Retrieve specific values from each group.


In [None]:
df = pd.DataFrame({
  'Category': ['A', 'B', 'A', 'C', 'B'],
  'Value': [10, 15, 20, 25, 30]
})
df

Unnamed: 0,Category,Value
0,A,10
1,B,15
2,A,20
3,C,25
4,B,30


In [64]:
# Group by 'Category' and calculate the sum of 'Value' for each group
df.groupby('Category')['Value'].sum() # this returns a Series, with Category as the index

Category
A    30
B    45
C    25
Name: Value, dtype: int64

In [65]:
df.groupby('Category')['Value'].sum().reset_index() # this return a DataFrame, with Category as a column

Unnamed: 0,Category,Value
0,A,30
1,B,45
2,C,25


aggregate() or agg():
This method provides more flexibility for applying multiple aggregation functions to one or more columns, or different functions to different columns.

## multiple aggregations

In [143]:
df.groupby('Category')['Value'].agg(['sum', 'mean']) # Apply multiple aggregations to a single column


Unnamed: 0_level_0,sum,mean
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,30,15.0
B,45,22.5
C,25,25.0


In [144]:
# alternatively, 
df.groupby('Category').agg(
  sum=('Value', 'sum'),
  mean=('Value', 'mean')
)

Unnamed: 0_level_0,sum,mean
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,30,15.0
B,45,22.5
C,25,25.0


In [145]:
# alternatively, 
df.groupby('Category')['Value'].agg(
  sum='sum',
  mean='mean'
)

Unnamed: 0_level_0,sum,mean
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,30,15.0
B,45,22.5
C,25,25.0


In [66]:
# Apply different aggregations to different columns
df = pd.DataFrame({
  'Category': ['A', 'B', 'A', 'C', 'B'],
  'Value1': [10, 15, 20, 25, 30],
  'Value2': [1, 2, 3, 4, 5]
})
df

Unnamed: 0,Category,Value1,Value2
0,A,10,1
1,B,15,2
2,A,20,3
3,C,25,4
4,B,30,5


In [67]:
df.groupby('Category').agg(
    total_value1=('Value1', 'sum'),
    avg_value2=('Value2', 'mean')
)

Unnamed: 0_level_0,total_value1,avg_value2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,30,2.0
B,45,3.5
C,25,4.0


In [148]:
# User-defined functions can be passed to agg() for more specialized aggregation logic.

In [69]:
df = pd.DataFrame({
  'Category': ['A', 'B', 'A', 'C', 'B'],
  'Value': [10, 15, 20, 25, 30]
})
def custom_range(series):
    return series.max() - series.min()
df.groupby('Category')['Value'].agg(custom_range)

Category
A    10
B    15
C     0
Name: Value, dtype: int64

In [70]:
# alternatively
df.groupby('Category')['Value'].apply(custom_range)

Category
A    10
B    15
C     0
Name: Value, dtype: int64

##### Use .agg() → when your function is truly an aggregation (scalar result per group).

#### Use .apply() → when your function may return non-scalar results or you need more flexibility.

In [155]:
df = pd.DataFrame({
    "Category": ["A","A","B","B","C","C"],
    "Value": [5, 17, 3, 8, 10, 19]
})
df

Unnamed: 0,Category,Value
0,A,5
1,A,17
2,B,3
3,B,8
4,C,10
5,C,19


In [156]:
df.groupby("Category")["Value"].agg(
    range_min="min",
    range_max="max"
)

Unnamed: 0_level_0,range_min,range_max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,17
B,3,8
C,10,19


In [157]:
def custom_range(series):
    return series.max() - series.min()
  
df.groupby("Category")["Value"].agg(
    range_min="min",
    range_max="max",
    range = custom_range
)

Unnamed: 0_level_0,range_min,range_max,range
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,5,17,12
B,3,8,5
C,10,19,9


In [161]:
def custom_range(series):
  return pd.Series({
    'min': series.min(),
    'max': series.max(),
    'range': series.max() - series.min()
  })

df.groupby("Category")["Value"].apply(custom_range).reset_index()

Unnamed: 0,Category,level_1,Value
0,A,min,5
1,A,max,17
2,A,range,12
3,B,min,3
4,B,max,8
5,B,range,5
6,C,min,10
7,C,max,19
8,C,range,9


In [None]:
""" 2356
Write a solution to calculate the number of unique subjects each teacher teaches in the university.
Return the result table in any order.
"""

df = ascii_table_to_df("""
+------------+------------+---------+
| teacher_id | subject_id | dept_id |
+------------+------------+---------+
| 1          | 2          | 3       |
| 1          | 2          | 4       |
| 1          | 3          | 3       |
| 2          | 1          | 1       |
| 2          | 2          | 1       |
| 2          | 3          | 1       |
| 2          | 4          | 1       |
+------------+------------+---------+
""")
def func(df: pd.DataFrame) -> pd.DataFrame:
  return df.groupby('teacher_id')['subject_id'].nunique().reset_index(name='cnt')
func(df)
# this will drop dept_id, because nunique() operates on subject_id not dept_id 
# subject_id and dept_id will have different sizes

Unnamed: 0,teacher_id,cnt
0,1,2
1,2,4


In [None]:
""" 1484
Write a solution to find for each date the number of different products sold and their names.
The sold products names for each date should be sorted lexicographically.
Return the result table ordered by sell_date
"""
activities = ascii_table_to_df("""
+------------+------------+
| sell_date  | product     |
+------------+------------+
| 2020-05-30 | Headphone  |
| 2020-06-01 | Pencil     |
| 2020-06-02 | Mask       |
| 2020-05-30 | Basketball |
| 2020-06-01 | Bible      |
| 2020-06-02 | Mask       |
| 2020-05-30 | T-Shirt    |
+------------+------------+
""")
def categorize_products(activities: pd.DataFrame) -> pd.DataFrame:
  return activities.groupby('sell_date').agg(
    num_sold = ('product', 'nunique'),
    products = ('product', lambda x: ','.join(sorted(x.unique())))
  ).reset_index()
categorize_products(activities)

Unnamed: 0,sell_date,num_sold,products
0,2020-05-30,3,"Basketball,Headphone,T-Shirt"
1,2020-06-01,2,"Bible,Pencil"
2,2020-06-02,1,Mask


In [78]:
"""
For each date_id and make_name, find the number of distinct lead_id's and distinct partner_id's.
Return the result table in any order.
"""
daily_sales = ascii_table_to_df("""
+-----------+-----------+---------+------------+
| date_id   | make_name | lead_id | partner_id |
+-----------+-----------+---------+------------+
| 2020-12-8 | toyota    | 0       | 1          |
| 2020-12-8 | toyota    | 1       | 0          |
| 2020-12-8 | toyota    | 1       | 2          |
| 2020-12-7 | toyota    | 0       | 2          |
| 2020-12-7 | toyota    | 0       | 1          |
| 2020-12-8 | honda     | 1       | 2          |
| 2020-12-8 | honda     | 2       | 1          |
| 2020-12-7 | honda     | 0       | 1          |
| 2020-12-7 | honda     | 1       | 2          |
| 2020-12-7 | honda     | 2       | 1          |
+-----------+-----------+---------+------------+
""")
def daily_leads_and_partners(daily_sales: pd.DataFrame) -> pd.DataFrame:
    return daily_sales.groupby(['date_id','make_name']).agg(
      unique_leads = ('lead_id', 'nunique'),
      unique_partners = ('partner_id', 'nunique')
    ).reset_index()
daily_leads_and_partners(daily_sales)

Unnamed: 0,date_id,make_name,unique_leads,unique_partners
0,2020-12-07,honda,3,2
1,2020-12-07,toyota,1,2
2,2020-12-08,honda,2,2
3,2020-12-08,toyota,2,3


## an employees case

In [100]:
df = pd.read_csv('./employees.csv', parse_dates=['hire_date'])
# alternatively
# df = pd.read_csv('./employees.csv', )
# df['hire_date'] = pd.to_datetime(df['hire_date'])
df

Unnamed: 0,employee_id,department,manager,hire_date,salary,region,sales
0,101,Sales,Alice,2018-05-10,70000,East,150000
1,102,Sales,Alice,2020-07-12,68000,East,120000
2,103,Engineering,Bob,2019-03-05,95000,West,50000
3,104,Engineering,Bob,2021-11-20,87000,West,60000
4,105,HR,Carol,2017-01-15,60000,East,0
5,106,HR,Carol,2019-08-22,62000,West,0
6,107,Sales,Alice,2022-04-01,71000,West,80000
7,108,Engineering,Bob,2018-09-09,98000,East,70000
8,109,Sales,Alice,2019-12-17,69000,West,90000
9,110,HR,Carol,2021-06-05,64000,East,0


In [102]:
df.dtypes

employee_id             int64
department             object
manager                object
hire_date      datetime64[ns]
salary                  int64
region                 object
sales                   int64
dtype: object

In [105]:
"""
count employees by department
"""
df.groupby('department').size().reset_index(name='num_employees')

Unnamed: 0,department,num_employees
0,Engineering,3
1,HR,3
2,Sales,4


In [None]:
"""
average salary per department
"""
df.groupby('department')['salary'].mean().reset_index(name='avg_salary')

Unnamed: 0,department,avg_salary
0,Engineering,93333.333333
1,HR,62000.0
2,Sales,69500.0


In [107]:
"""
total and average sales per region 
"""
df.groupby('region')['sales'].agg(
  total_sales='sum',
  avg_sales='mean'
).reset_index()

Unnamed: 0,region,total_sales,avg_sales
0,East,340000,68000.0
1,West,280000,56000.0


In [119]:
"""
earliest and latest hire data per manager
"""
df.groupby('manager')['hire_date'].agg(
  first_hire='min',
  last_hire='max'
).reset_index()

Unnamed: 0,manager,first_hire,last_hire
0,Alice,2018-05-10,2022-04-01
1,Bob,2018-09-09,2021-11-20
2,Carol,2017-01-15,2021-06-05


In [121]:
"""
distint count of employees per department and region 
"""
df.groupby(['department', 'region'])['employee_id'].nunique().reset_index(name='cnt')

Unnamed: 0,department,region,cnt
0,Engineering,East,1
1,Engineering,West,2
2,HR,East,2
3,HR,West,1
4,Sales,East,2
5,Sales,West,2


In [None]:
"""
for each department, list the top 1 highest paid employees
"""
# .head() works row-wise, not as a group reducer
df.sort_values('salary', ascending=False).groupby('department').head(1)

Unnamed: 0,employee_id,department,manager,hire_date,salary,region,sales
7,108,Engineering,Bob,2018-09-09,98000,East,70000
6,107,Sales,Alice,2022-04-01,71000,West,80000
9,110,HR,Carol,2021-06-05,64000,East,0


In [None]:
"""
for each manager, collect employee IDs as a comma-separated string
"""
# use apply (although using agg has similar effect, as both work as a group reducer in this case)
# employee_id is of type int, use map(str, x) to convert to str first
df.groupby('manager')['employee_id'].apply(
  lambda x: ','.join(sorted(map(str, x)))
).reset_index(name='employee_ids')

Unnamed: 0,manager,employee_ids
0,Alice,101102107109
1,Bob,103104108
2,Carol,105106110


In [139]:
"""
for each region, what % of total company sales does it contribute
"""
region_sales = df.groupby('region')['sales'].sum().reset_index()
region_sales['sale_pct'] = round(region_sales['sales']/region_sales['sales'].sum()*100, 2)
region_sales

Unnamed: 0,region,sales,sale_pct
0,East,340000,54.84
1,West,280000,45.16


In [None]:
"""
monthly average salary trend per department
"""
# group by month: pd.Grouper(key='hire_date', freq='ME')
df.groupby([
  'department',
  pd.Grouper(key='hire_date', freq='ME')
])['salary'].mean().reset_index()

Unnamed: 0,department,hire_date,salary
0,Engineering,2018-09-30,98000.0
1,Engineering,2019-03-31,95000.0
2,Engineering,2021-11-30,87000.0
3,HR,2017-01-31,60000.0
4,HR,2019-08-31,62000.0
5,HR,2021-06-30,64000.0
6,Sales,2018-05-31,70000.0
7,Sales,2019-12-31,69000.0
8,Sales,2020-07-31,68000.0
9,Sales,2022-04-30,71000.0


In [144]:
"""
pivot table of total sales by department and region
"""
pd.pivot_table(
  df, 
  index='department',
  columns = 'region',
  values='sales',
  aggfunc = 'sum',
  fill_value=0
)

region,East,West
department,Unnamed: 1_level_1,Unnamed: 2_level_1
Engineering,70000,110000
HR,0,0
Sales,270000,170000


In [146]:
"""
top 2 employees by sales in each region
"""
df.sort_values('sales', ascending=False).groupby('region').head(2)

Unnamed: 0,employee_id,department,manager,hire_date,salary,region,sales
0,101,Sales,Alice,2018-05-10,70000,East,150000
1,102,Sales,Alice,2020-07-12,68000,East,120000
8,109,Sales,Alice,2019-12-17,69000,West,90000
6,107,Sales,Alice,2022-04-01,71000,West,80000


In [151]:
"""
rolling average of sales per region (last 2 hires)
"""
# rolling over last 2 hires, so it needs to sort accoding to hire_date first
df.sort_values('hire_date').groupby('region')['sales'].rolling(2).mean().reset_index(name='rolling_avg_sale_last_2_hires')

Unnamed: 0,region,level_1,rolling_avg_sale_last_2_hires
0,East,4,
1,East,0,75000.0
2,East,7,110000.0
3,East,1,95000.0
4,East,9,60000.0
5,West,2,
6,West,5,25000.0
7,West,8,45000.0
8,West,3,75000.0
9,West,6,70000.0


In [153]:
"""
percentage of employees in each department relative to company
"""
dept_empl = df.groupby('department')['employee_id'].nunique().reset_index(name='empl_cnt')
dept_empl['dept_empl_pct'] = dept_empl['empl_cnt']/dept_empl['empl_cnt'].sum()*100
dept_empl

Unnamed: 0,department,empl_cnt,dept_empl_pct
0,Engineering,3,30.0
1,HR,3,30.0
2,Sales,4,40.0


# partition after aggregation

In [1]:
import pandas as pd
import numpy as np

# Sample DataFrame
data = {'group': ['A', 'A', 'B', 'B', 'A', 'C', 'C'],
        'value': [10, 15, 20, 25, 12, 30, 35]}
df = pd.DataFrame(data)

# Calculate the mean of 'value' for each 'group' and add it as a new column
df['group_mean'] = df.groupby('group')['value'].transform(np.mean)

print(df)

  group  value  group_mean
0     A     10   12.333333
1     A     15   12.333333
2     B     20   22.500000
3     B     25   22.500000
4     A     12   12.333333
5     C     30   32.500000
6     C     35   32.500000


  df['group_mean'] = df.groupby('group')['value'].transform(np.mean)


In [162]:
table = """
+--------------+-----------------+
| order_number | customer_number |
+--------------+-----------------+
| 1            | 1               |
| 2            | 2               |
| 3            | 3               |
| 4            | 3               |
+--------------+-----------------+
"""
orders = ascii_table_to_df(table)
orders

Unnamed: 0,order_number,customer_number
0,1,1
1,2,2
2,3,3
3,4,3


In [None]:
counts = orders['customer_number'].value_counts()
max_count = counts.max()
counts[counts==max_count].reset_index()[['customer_number']]

Unnamed: 0,customer_number
0,3


Unnamed: 0,customer_number
0,3


In [170]:
orders

Unnamed: 0,order_number,customer_number
0,1,1
1,2,2
2,3,3
3,4,3


In [171]:
orders.to_dict()

{'order_number': {0: 1, 1: 2, 2: 3, 3: 4},
 'customer_number': {0: 1, 1: 2, 2: 3, 3: 3}}

In [172]:
orders.to_dict(orient='list')

{'order_number': [1, 2, 3, 4], 'customer_number': [1, 2, 3, 3]}

In [173]:
orders.to_dict(orient='series')

{'order_number': 0    1
 1    2
 2    3
 3    4
 Name: order_number, dtype: int64,
 'customer_number': 0    1
 1    2
 2    3
 3    3
 Name: customer_number, dtype: int64}

In [174]:
orders.to_dict(orient='records')

[{'order_number': 1, 'customer_number': 1},
 {'order_number': 2, 'customer_number': 2},
 {'order_number': 3, 'customer_number': 3},
 {'order_number': 4, 'customer_number': 3}]

In [176]:
order_dict = {'order_number': [1, 2, 3, 4], 'customer_number': [1, 2, 3, 3]}
order_df = pd.DataFrame(order_dict)

In [177]:
order_df

Unnamed: 0,order_number,customer_number
0,1,1
1,2,2
2,3,3
3,4,3


Unnamed: 0,employee_id,bonus
0,2,0
1,3,0
2,7,7400
3,8,0
4,9,7700
