# Pandas basics

## Create a DataFrame

In [24]:
import pandas as pd

# Build a small example frame; each keyword below becomes one column,
# in the order given.
df = pd.DataFrame(
    dict(
        name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
        age=[25, 30, 35, 40, 45],
        city=['Berlin', 'München', 'Köln', 'Hamburg', 'Stuttgart'],
    )
)
df

Unnamed: 0,name,age,city
0,Alice,25,Berlin
1,Bob,30,München
2,Charlie,35,Köln
3,David,40,Hamburg
4,Eva,45,Stuttgart


## Filter DataFrame

In [22]:
# Boolean-mask selection: keep only the rows whose age is strictly above 30.
# The original row labels are preserved in the result.
df_filtered = df.loc[df["age"].gt(30)]
df_filtered

Unnamed: 0,name,age,city
2,Charlie,35,Köln
3,David,40,Hamburg
4,Eva,45,Stuttgart


## Merge DataFrames

In [3]:
import pandas as pd

# Example: combine two DataFrames that share a key column
data1 = dict(name=['Anna', 'Ben', 'Chris'], age=[23, 30, 35])
data2 = dict(name=['Anna', 'Ben', 'Chris'], city=['Berlin', 'Hamburg', 'München'])

df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)

# Join the two frames on the shared 'name' column
df_merged = df1.merge(df2, on='name')
df_merged

Unnamed: 0,name,age,city
0,Anna,23,Berlin
1,Ben,30,Hamburg
2,Chris,35,München


## Melt DataFrames

In [6]:
# Wide table: one row per student, one column per subject.
df_scores = pd.DataFrame(
    dict(
        name=['Alice', 'Bob', 'Charlie'],
        math=[90, 80, 85],
        english=[85, 95, 80],
    )
)

# Reshape to long form: one row per (student, subject) pair. The former
# column labels go into 'subject' and their values into 'score'.
df_melted = df_scores.melt(
    id_vars=['name'],
    value_vars=['math', 'english'],
    var_name='subject',
    value_name='score',
)
print(df_melted)

      name  subject  score
0    Alice     math     90
1      Bob     math     80
2  Charlie     math     85
3    Alice  english     85
4      Bob  english     95
5  Charlie  english     80


## Concat DataFrames

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

In [13]:
# Two monthly sales tables with identical columns.
df1 = pd.DataFrame({"month": [5] * 3, "sales": [300, 500, 1000]})
df2 = pd.DataFrame({"month": [6] * 3, "sales": [200, 450, 1230]})

# Stack the frames vertically. Each frame keeps its own row labels,
# so the resulting index repeats 0, 1, 2.
df3 = pd.concat(objs=[df1, df2])
df3

Unnamed: 0,month,sales
0,5,300
1,5,500
2,5,1000
0,6,200
1,6,450
2,6,1230


In [14]:
# Same two monthly sales tables as in the previous example.
df1 = pd.DataFrame(dict(month=[5, 5, 5], sales=[300, 500, 1000]))
df2 = pd.DataFrame(dict(month=[6, 6, 6], sales=[200, 450, 1230]))

# ignore_index=True discards the per-frame row labels and numbers the
# stacked rows consecutively (0..5) instead.
df3 = pd.concat(
    [df1, df2],
    ignore_index=True,
)
df3

Unnamed: 0,month,sales
0,5,300
1,5,500
2,5,1000
3,6,200
4,6,450
5,6,1230


## Pivot

In [16]:
# Long-form input: one row per (name, year) observation.
data_pivot = dict(
    name=['Anna', 'Ben', 'Chris', 'Anna', 'Ben', 'Chris'],
    year=[2020, 2020, 2020, 2021, 2021, 2021],
    score=[85, 90, 75, 88, 93, 77],
)
df_pivot = pd.DataFrame(data_pivot)

# Spread the years into columns: one row per name, one column per year,
# with the cells holding the matching score.
df_pivot.pivot(
    index='name',
    columns='year',
    values='score',
)

year,2020,2021
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anna,85,88
Ben,90,93
Chris,75,77


## Get a column of a DataFrame as Series

In [18]:
df = pd.DataFrame(
    dict(
        name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
        age=[25, 30, 35, 40, 45],
        city=['Berlin', 'München', 'Köln', 'Hamburg', 'Stuttgart'],
    )
)

# Selecting a single column by its label yields a Series that carries
# the frame's row index and the column label as its name.
name = df["name"]
name, type(name)

(0      Alice
 1        Bob
 2    Charlie
 3      David
 4        Eva
 Name: name, dtype: object,
 pandas.core.series.Series)

## Get a row of a DataFrame as Series

In [19]:
df = pd.DataFrame(
    data={
        "name": ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
        "age": [25, 30, 35, 40, 45],
        "city": ['Berlin', 'München', 'Köln', 'Hamburg', 'Stuttgart'],
    },
)

# Positional lookup of the first row. The result is a Series indexed by
# the column labels and named after the row's label (0 here).
row0 = df.iloc[0]
row0, type(row0)

(name     Alice
 age         25
 city    Berlin
 Name: 0, dtype: object,
 pandas.core.series.Series)

## Fill NaN values
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html

In [44]:
import pandas as pd
import numpy as np

# One numeric column with a few missing entries.
data = {
    "numbers": [1, 2, 3, 4, 5, np.nan, 6, 7, np.nan, np.nan, 8, 9, 10, np.nan]
}

df1 = pd.DataFrame(data)

# Fill every NaN with the mean of its own column. The mean must be taken
# from df1 (the frame being filled): df2 does not exist yet at this point,
# so referencing it here would fail on a fresh kernel or silently use a
# stale df2 left over from another cell.
# DataFrame.mean() skips NaN by default, so the fill value is the mean of
# the present values only (5.5 for this data).
df2 = df1.fillna(df1.mean())
df2

Unnamed: 0,numbers
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
5,5.5
6,6.0
7,7.0
8,5.5
9,5.5


## Fill NaN values with the mean of a subgroup

In [57]:
import pandas as pd
import numpy as np

data = dict(
    name=["Peter", "Paul", "Michael", "Tanja", "Erna", "Sabine"],
    age=[53, 27, np.nan, 42, 80, np.nan],
    sex=["m", "m", "m", "f", "f", "f"],
)

df = pd.DataFrame(data)

# Every missing age (regardless of the row's own sex) is replaced by the
# mean age of the female rows only.
df['age'] = df['age'].fillna(
    df.loc[df['sex'].eq('f'), 'age'].mean()
)
df

Unnamed: 0,name,age,sex
0,Peter,53.0,m
1,Paul,27.0,m
2,Michael,61.0,m
3,Tanja,42.0,f
4,Erna,80.0,f
5,Sabine,61.0,f


## Fill NaN values with per-group means

In [62]:
import pandas as pd
import numpy as np

data = dict(
    name=["Peter", "Paul", "Michael", "Tanja", "Erna", "Sabine"],
    age=[53, 27, np.nan, 42, 80, np.nan],
    sex=["m", "m", "m", "f", "f", "f"],
)

df = pd.DataFrame(data)

# For each row, the mean age of that row's 'sex' group; transform() returns
# a Series aligned with df's index rather than one value per group.
means = df['age'].groupby(df['sex']).transform('mean')

# Passing a Series to fillna fills each missing age from the entry with the
# same index label, i.e. with the mean of the row's own group.
df['age'] = df['age'].fillna(means)

# Alternative: compute the per-group means once with groupby(...).mean()
# and map them onto the 'sex' column to obtain the fill values.

df

Unnamed: 0,name,age,sex
0,Peter,53.0,m
1,Paul,27.0,m
2,Michael,40.0,m
3,Tanja,42.0,f
4,Erna,80.0,f
5,Sabine,61.0,f
