In [1]:
import pandas as pd

Create DataFrame from List

In [2]:
student_data = [[1, 15], [2, 11], [3, 11], [4, 20]]

In [3]:
df = pd.DataFrame(student_data, columns=['Student Number', 'Age'])
print(df)

   Student Number  Age
0               1   15
1               2   11
2               3   11
3               4   20


Get the Size of a DataFrame (List)

In [4]:
df.shape

(4, 2)

Get the First 3 Rows

In [5]:
df.head(3)

Unnamed: 0,Student Number,Age
0,1,15
1,2,11
2,3,11


Select Data

In [6]:
students = [[101, "Ulysses", 13], [102, "Penelope", 14], [103, "Telemachus", 15], [104, "Calypso", 16]]

df = pd.DataFrame(students, columns=['Student ID', 'Name', 'Age'])

print(df)

   Student ID        Name  Age
0         101     Ulysses   13
1         102    Penelope   14
2         103  Telemachus   15
3         104     Calypso   16


In [7]:
df.loc[df['Student ID'] == 101, ['Name', 'Age']]

Unnamed: 0,Name,Age
0,Ulysses,13


Create a New Column

In [8]:
employees = [["Piper", 4548], ["Alex", 2587], ["Suzanne", 3547], ["Nicky", 5874]]

df = pd.DataFrame(employees, columns=['Name', 'Salary'])

print(df)

      Name  Salary
0    Piper    4548
1     Alex    2587
2  Suzanne    3547
3    Nicky    5874


In [9]:
df['Bonus'] = df['Salary'] * 2
print(df)

      Name  Salary  Bonus
0    Piper    4548   9096
1     Alex    2587   5174
2  Suzanne    3547   7094
3    Nicky    5874  11748


Drop duplicate rows and keep only the first occurrence

In [10]:
employees = [[1, "Ella", "emily@example.com"], [2, "David", "michael@example.com"], [3, "Zacahry", "sarah@example.com"], [4, "Alice", "john@example.com "], [5, "Finn", "john@example.com "], [6, "Joe", "john@example.com "], [7, "Violet", "alice@example.com "]]

df = pd.DataFrame(employees, columns=['customer_id', 'Name', 'Email']) 

print(df)

   customer_id     Name                Email
0            1     Ella    emily@example.com
1            2    David  michael@example.com
2            3  Zacahry    sarah@example.com
3            4    Alice    john@example.com 
4            5     Finn    john@example.com 
5            6      Joe    john@example.com 
6            7   Violet   alice@example.com 


In [11]:
# subset='Email': This means that we are considering duplicates based on the email column only.
# keep='first': This indicates that we want to keep the first occurrence of any duplicated email and drop the subsequent occurrences.
# inplace=True: This means the changes will be made directly to the passed DataFrame (customers) without returning a new one.

df.drop_duplicates(subset='Email', keep='first', inplace=True)

print(df)

   customer_id     Name                Email
0            1     Ella    emily@example.com
1            2    David  michael@example.com
2            3  Zacahry    sarah@example.com
3            4    Alice    john@example.com 
6            7   Violet   alice@example.com 


Drop Missing Data

In [12]:
df = pd.DataFrame(
{"student_id" : [32, 33, 34],
"name" : [None, "John", "Doe"],
"age" : [10, 11, 12]})

In [13]:
# print(df.dropna())

# DO NOT NEST
df.dropna(subset=['name'], inplace=True)

print(df)

   student_id  name  age
1          33  John   11
2          34   Doe   12


Modify Columns

In [14]:
df = pd.DataFrame(
{"name" : ["Jack", "Piper", "Alex", "Suzanne", "Nicky"],
"salary" : [19666, 25478, 25874, 35478, 58745]})

In [15]:
df["salary"] = df["salary"] * 2

print(df)

      name  salary
0     Jack   39332
1    Piper   50956
2     Alex   51748
3  Suzanne   70956
4    Nicky  117490


Rename Columns

In [16]:
df = pd.DataFrame(
{"id" : [1, 2, 3, 4, 5],
"first" : ["Jack", "Piper", "Alex", "Suzanne", "Nicky"],
"last" : ["Smith", "Brown", "Johnson", "Williams", "Jones"],
"age" : [6, 7, 8, 9, 10]})

In [17]:
df.rename(columns = {'id':'student_id', 'first':'first_name', 'last':'last_name', 'age':'age_in_years'})
print(df)

   id    first      last  age
0   1     Jack     Smith    6
1   2    Piper     Brown    7
2   3     Alex   Johnson    8
3   4  Suzanne  Williams    9
4   5    Nicky     Jones   10


Change Data Type

In [18]:
df = pd.DataFrame(
{"student_id" : [1, 2],
"name" : ["Jack", "Piper"],
"age" : [6, 15],
"grade" : [73.0, 85.0]})

In [19]:
df["grade"] = df["grade"].astype(int)

print(df)

   student_id   name  age  grade
0           1   Jack    6     73
1           2  Piper   15     85


Fill Missing Data

In [20]:
df = pd.DataFrame(
{"name" : ["Jack", "Piper", "Alex", "Suzanne", "Nicky"],
"quantity" : [None, None, 779, 1000, 2000],
"price" : [6, null, 8, 9, 10]})

NameError: name 'null' is not defined

In [None]:
df["quantity"] = df["quantity"].fillna(0, inplace=True)

print(df)

      name quantity  price
0     Jack     None      6
1    Piper     None      7
2     Alex     None      8
3  Suzanne     None      9
4    Nicky     None     10


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["quantity"] = df["quantity"].fillna(0, inplace=True)


Reshape Data: Concatenate

In [30]:
df1 = pd.DataFrame(
{"student_id" : [1, 2, 3, 4],
"name" : ["Jack", "Piper", "Alex", "Suzanne"],
"age" : [6, 7, 8, 9]})

df2 = pd.DataFrame(
{"student_id" : [1, 2],
"name" : ["Leo", "Alex"],
"age" : [6, 15]})

In [31]:
df = pd.concat([df1,df2])

print(df)

   student_id     name  age
0           1     Jack    6
1           2    Piper    7
2           3     Alex    8
3           4  Suzanne    9
0           1      Leo    6
1           2     Alex   15


Reshape Data: Pivot (Convert to Wide Format)

In [23]:
df = pd.DataFrame(
{"city" : ["Jacksonville", "Jacksonville", "Jacksonville", "Jacksonville", "Jacksonville", "El Paso", "El Paso", "El Paso", "El Paso", "El Paso"],
"month" : ["January", "February", "March", "April", "May", "January", "February", "March", "April", "May"],
"temperature" : [13, 23, 38, 5, 34, 20, 6, 26, 2, 43]})

In [24]:
df_wide = df.pivot(index='month', columns='city', values='temperature')

print(df_wide)

city      El Paso  Jacksonville
month                          
April           2             5
February        6            23
January        20            13
March          26            38
May            43            34


Reshape Data: Melt (Convert to Long Format)

In [27]:
df = pd.DataFrame(
{"product" : ["Umbrella", "Sleeping Bag"],
"quarter_1" : [417, 800],
"quarter_2" : [224, 936],
"quarter_3" : [379, 93],
"quarter_4" : [611, 875]})

In [29]:
df_long = df.melt(id_vars='product', 
                  value_vars=['quarter_1', 'quarter_2', 'quarter_3', 'quarter_4'], 
                  var_name='quarter', 
                  value_name='sales')

print(df_long)

        product    quarter  sales
0      Umbrella  quarter_1    417
1  Sleeping Bag  quarter_1    800
2      Umbrella  quarter_2    224
3  Sleeping Bag  quarter_2    936
4      Umbrella  quarter_3    379
5  Sleeping Bag  quarter_3     93
6      Umbrella  quarter_4    611
7  Sleeping Bag  quarter_4    875


In [None]:
df = pd.Series([2.6, 3, 4.9, 5, 5, 6, 6, 7.9, 8, 8.2])

df.std(ddof=0).round(decimals=1)

1.9

In [None]:
df.median()

5.5

In [None]:
df.mean()

5.659999999999999

In [None]:
df.quantile([0.25, 0.5, 0.75])

0.25    4.925
0.50    5.500
0.75    7.425
dtype: float64

In [None]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
IQR

2.5

In [None]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

lower_bound, upper_bound

(1.1750000000000007, 11.175)

In [None]:
df = pd.DataFrame({"values": [5, 6, 5, 6, 8]})

mad_mean = df["values"].sub(df["values"].mean()).abs().mean().round(2)

print("MAD (Mean-Based):", mad_mean)

MAD (Mean-Based): 0.8


In [None]:
t1 = 0.13350
t2 = 0.98679

print(t2-t1)

0.8532899999999999


In [None]:
df = pd.DataFrame({"x": [60], "y_actual": [78]})

# Compute predicted values using the equation ŷ = -47 + 2x
df["y_predicted"] = 2 + 7/6 * df["x"]

# Compute residuals: Residual = Actual - Predicted
df["residual"] = df["y_actual"] - df["y_predicted"]

print(df)

    x  y_actual  y_predicted  residual
0  60        78         72.0       6.0


In [None]:
# Given values
x_mean = 75.8
y_mean = 696
sx = 14.8
sy = 177.6
r = 0.81

# Compute slope (b1)
b1 = r * (sy / sx)

# Compute intercept (b0)
b0 = y_mean - b1 * x_mean

# Round to two decimal places
b0, b1 = round(b0, 3), round(b1, 4)

print(f"ŷ = {b0} + {b1}x")

ŷ = -40.776 + 9.72x


In [1]:
# online_orders["date_sold"] = pd.to_datetime(online_orders["date_sold"], errors='coerce')

# online_orders["revenue"] = online_orders["cost_in_dollars"] * online_orders["units_sold"]

# first_half = online_orders[online_orders["date_sold"].between("2022-01-01", "2022-06-30")].groupby(by="product_id", as_index=False).sum().sort_values(by=["revenue"], ascending=False).head(5)[["product_id", "revenue"]]

# first_half

# link: https://platform.stratascratch.com/coding/2119-most-lucrative-products?code_type=2

In [2]:
# median = sat_scores["sat_writing"].median()

# students = sat_scores[sat_scores["sat_writing"] == median]["student_id"]

# students

# link: https://platform.stratascratch.com/coding/9610-find-students-with-a-median-writing-score?code_type=2

In [3]:
# workers = pd.merge(worker, title, how='inner', left_on='worker_id', right_on='worker_ref_id')

# workers = workers[workers["salary"] == workers["salary"].max()]["worker_title"]

# workers

# link: https://platform.stratascratch.com/coding/10353-workers-with-the-highest-salaries?code_type=2