<a href="https://colab.research.google.com/github/farrelrassya/Cluster-Analysis-and-Dimensionality-Reduction/blob/main/02.%20Python%20Libraries/07_Pandas_DataFrames_and_Series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Creating data frames and series


#### Lecture agenda

- Pandas DataFrame
- Pandas Series

In [1]:
import pandas as pd
import numpy as np

#### Pandas DataFrame

In [2]:
# Build a labelled DataFrame on top of a NumPy array

# Row labels (the index) and column labels
index = ['Row1', 'Row2', 'Row3']
columns = ['Column1', 'Column2', 'Column3']

# 3x3 integer matrix that supplies the cell values
data = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# Wrap the array and attach both sets of labels
df = pd.DataFrame(data=data, index=index, columns=columns)

df

Unnamed: 0,Column1,Column2,Column3
Row1,1,2,3
Row2,4,5,6
Row3,7,8,9


In [3]:
# Inspect the class of the object returned by pd.DataFrame(...)
type(df)

In [4]:
# Dataframe shape: a (number of rows, number of columns) tuple
df.shape

(3, 3)

In [5]:
# Dataframe length: len() on a DataFrame counts its rows
len(df)

3

In [6]:
# DataFrame without explicit labels: rows and columns get default integer labels

# 3x3 integer matrix
data = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# No index/columns are passed, so pandas numbers both axes from 0
df = pd.DataFrame(data=data)

df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [7]:
# Replace the default integer labels with custom ones by assigning to the axis attributes
df.columns = ['c1', 'c2', 'c3']
df.index = ['a', 'b', 'c']
df

Unnamed: 0,c1,c2,c3
a,1,2,3
b,4,5,6
c,7,8,9


In [8]:
# DataFrame from a dict of lists: each key names a column, each list holds that column's values

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 32, 18, 41, 28],
    'City': ['New York', 'Los Angeles', 'London', 'Berlin', 'Sydney'],
}

# from_dict defaults to orient='columns', so the dict keys become the columns
df = pd.DataFrame.from_dict(data)

df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,32,Los Angeles
2,Charlie,18,London
3,David,41,Berlin
4,Eve,28,Sydney


In [9]:
# DataFrame from a list of dicts: each dict is one row, keyed by column name

data = [
    {'Name': 'Alice', 'Age': 25, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 32, 'City': 'Los Angeles'},
    {'Name': 'Charlie', 'Age': 18, 'City': 'London'},
    {'Name': 'David', 'Age': 41, 'City': 'Berlin'},
    {'Name': 'Eve', 'Age': 28, 'City': 'Sydney'},
]

# Treat every dict as one record (row)
df = pd.DataFrame.from_records(data)

df


Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,32,Los Angeles
2,Charlie,18,London
3,David,41,Berlin
4,Eve,28,Sydney


In [10]:
# DataFrame from a dict of dicts: outer keys name the columns, inner keys are the row labels

data = {
    'A': {0: 10, 1: 20, 2: 30, 3: 40, 4: 50, 5: 60},
    'B': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6},
    'C': {0: 'Yes', 1: 'No', 2: 'Cat', 3: 'Dog', 4: 'Rabbit', 5: 'Fish'},
    'D': {0: 'Thur', 1: 'Sun', 2: 'Thur', 3: 'Fri', 4: 'Sun', 5: 'Thur'},
    'E': {0: True, 1: False, 2: False, 3: False, 4: True, 5: False},
    'F': {0: True, 1: False, 2: False, 3: False, 4: True, 5: False},
    'G': {0: 6.7, 1: 2.2, 2: 3.4, 3: 11.1, 4: 12, 5: 22},
}

# orient='columns' (the default) maps each outer key to one column
df = pd.DataFrame.from_dict(data, orient='columns')

df


Unnamed: 0,A,B,C,D,E,F,G
0,10,1,Yes,Thur,True,True,6.7
1,20,2,No,Sun,False,False,2.2
2,30,3,Cat,Thur,False,False,3.4
3,40,4,Dog,Fri,False,False,11.1
4,50,5,Rabbit,Sun,True,True,12.0
5,60,6,Fish,Thur,False,False,22.0


In [11]:
# DataFrame from a list of lists: each inner list is one row; column names are supplied separately

data = [
    ['Alice', 25, 'New York'],
    ['Bob', 32, 'Los Angeles'],
    ['Charlie', 18, 'London'],
    ['David', 41, 'Berlin'],
    ['Eve', 28, 'Sydney'],
]

# The inner lists carry no labels, so name the columns explicitly
df = pd.DataFrame(data=data, columns=['Name', 'Age', 'City'])

df


Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,32,Los Angeles
2,Charlie,18,London
3,David,41,Berlin
4,Eve,28,Sydney


In [13]:
# Read a CSV file into a DataFrame, using its CustomerID column as the row index.
# The path is relative, so the file must be in the current working directory.

df = pd.read_csv('mall_customers.csv', index_col='CustomerID')

In [14]:
# Display the frame read from mall_customers.csv
df

Unnamed: 0_level_0,Gender,Age,Annual Income (k$),Spending Score (1-100)
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40
...,...,...,...,...
196,Female,35,120,79
197,Female,45,126,28
198,Male,32,126,74
199,Male,32,137,18


In [16]:
# Write a DataFrame out as a CSV file

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 32, 18, 41, 28],
    'City': ['New York', 'Los Angeles', 'London', 'Berlin', 'Sydney'],
}

df = pd.DataFrame(data=data)

# Comma-separated output in the working directory. `index` is left at its
# default (documented by pandas as True), so the row labels are written too.
df.to_csv('df0.csv', sep=',')

In [18]:
# Display the current contents of df
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,32,Los Angeles
2,Charlie,18,London
3,David,41,Berlin
4,Eve,28,Sydney


#### Pandas Series

A pandas Series is like a single column in an Excel spreadsheet: it is a one-dimensional sequence of items, each of which has an associated label (its 'index'). The items can be of any data type (numbers, strings, etc.).

In [19]:
# Series from a plain list, with explicit labels and a name

se = pd.Series([10, 40, 50], index=['Bob', 'Anna', 'Peter'], name='series1')

# print() shows the class; the trailing expression shows the Series itself
print(type(se))
se

<class 'pandas.core.series.Series'>


Bob      10
Anna     40
Peter    50
Name: series1, dtype: int64

In [20]:
# Re-display df before pulling a single column out of it
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,32,Los Angeles
2,Charlie,18,London
3,David,41,Berlin
4,Eve,28,Sydney


In [21]:
# Selecting a single column of a DataFrame yields a Series

# All rows, one column label
se = df.loc[:, 'Name']
print(type(se))
se

<class 'pandas.core.series.Series'>


0      Alice
1        Bob
2    Charlie
3      David
4        Eve
Name: Name, dtype: object

In [22]:
# Series from a dict: the keys become the labels, the values become the data

d1 = {
    'Height': 175,
    'Age': 25,
    'Weight': 70,
}

se = pd.Series(data=d1)

se

Height    175
Age        25
Weight     70
dtype: int64

In [23]:
# Promote the Series to a one-column DataFrame; its labels stay as the row index

df = se.to_frame()

df

Unnamed: 0,0
Height,175
Age,25
Weight,70
