In [3]:
import matplotlib.pyplot as plt
import pandas as pd

## Histograms

In [None]:
# Placeholder: no data is loaded here, so dog_pack starts as an empty DataFrame.
# The cells below read the columns height_cm, weight_kg, breed, name, date and sex,
# so they will fail (e.g. with KeyError) until dog_pack holds a frame with those columns.
# NOTE(review): the real data source is not shown in this notebook — confirm where it comes from.
dog_pack = pd.DataFrame()

In [None]:
# Create a histogram of the height_cm column
dog_pack['height_cm'].hist()

# Render the figure
plt.show()

In [None]:
## Changing the number of bins
# Draw the same histogram twice, first with 20 bins and then with 5,
# showing each figure before the next one is drawn.
for bin_count in (20, 5):
    dog_pack['height_cm'].hist(bins=bin_count)
    plt.show()

In [None]:
# Bar plots
# relate a categorical variable (breed) to a numerical one (mean weight)
avg_weight_by_breed = (
    dog_pack
    .groupby('breed')['weight_kg']
    .mean()
)
print(avg_weight_by_breed)

In [None]:
# one bar per breed, bar height = mean weight
avg_weight_by_breed.plot.bar()
plt.show()

In [None]:
# Adding a title: pass it straight to the plotting call
avg_weight_by_breed.plot.bar(title='Mean Weight by Dog Breed')
plt.show()

In [None]:
# Line plot
# good for showing numerical changes over time
# Keep only the rows for one dog.
# NOTE(review): the comparison is case-sensitive — confirm the name is stored as 'sully'.
sully = dog_pack[dog_pack['name'] == 'sully']
# Draw the line before plt.show(); without a plot call there is nothing to render.
sully.plot(x='date', y='weight_kg', kind='line')
plt.show()

In [None]:
# Rotating axis labels
# rot=45 rotates the tick labels by 45 degrees
sully.plot(x='date',
           y='weight_kg',
           kind='line',
           rot=45)
# Render the figure explicitly, as the other plotting cells do,
# instead of ending the cell on the Axes object returned by plot().
plt.show()

In [None]:
# Scatter plots
# good for visualising the relationship between two numerical variables
dog_pack.plot.scatter(x='height_cm', y='weight_kg')
plt.show()

In [None]:
## Layering plots
# Both histograms are drawn before plt.show(), so they end up in the same figure.
# .loc picks the rows (by sex) and the column (height_cm) in a single step.
dog_pack.loc[dog_pack['sex'] == 'F', 'height_cm'].hist()
dog_pack.loc[dog_pack['sex'] == 'M', 'height_cm'].hist()
plt.show()

In [None]:
# Adding a legend
# The labels are given in the order the histograms are drawn: F first, then M.
dog_pack.loc[dog_pack['sex'] == 'F', 'height_cm'].hist()
dog_pack.loc[dog_pack['sex'] == 'M', 'height_cm'].hist()
plt.legend(['F', 'M'])
plt.show()

In [None]:
# Transparency
# alpha=0.7 makes the bars partly see-through so the overlapping histograms both stay visible.
dog_pack.loc[dog_pack['sex'] == 'F', 'height_cm'].hist(alpha=0.7)
dog_pack.loc[dog_pack['sex'] == 'M', 'height_cm'].hist(alpha=0.7)
plt.legend(['F', 'M'])
plt.show()

## Missing Values

In [None]:
# Placeholder: an empty DataFrame standing in for a dogs dataset that contains missing values.
# NOTE(review): presumably replaced with real data when following along — confirm the source.
dogs = pd.DataFrame()

In [None]:
# In pandas, missing DataFrame values are represented with NaN (Not a Number)
# Detecting missing values
# NOTE: this is not the last expression in the cell, so a notebook will not render its result.
dogs.isna() # returns a boolean for every value (True where it is missing) -> not very helpful on its own

# Overview of the data: reports whether any value is missing
dogs.isna().any()

In [None]:
# Counting missing values
# Summing booleans is the same as counting the number of True values
dogs.isna().sum()

In [None]:
# Plotting missing values
# bar chart of the missing-value counts computed by isna().sum()
dogs.isna().sum().plot.bar()
plt.show()

In [None]:
# What to do with missing values
# option 1: remove them - not ideal if there is a lot of missing data
# The result is not assigned, so it is only displayed here; dogs keeps its original contents.
dogs.dropna()

In [None]:
# option 2: replace missing values (here with 0)
# The result is not assigned, so it is only displayed here; dogs keeps its original contents.
dogs.fillna(0)

## Creating DataFrames

In [None]:
# Dictionaries - key: value pairs
# Placeholder values for the generic template below.
# A chained assignment binds all three names to 0; tuple-unpacking a single
# int (value1, value2, value3 = 0) raises TypeError.
value1 = value2 = value3 = 0

# Generic shape of a dict literal
my_dict = {
    'key1': value1,
    'key2': value2,
    'key3': value3
}

# Concrete example; this rebinds my_dict, replacing the template above
my_dict = {
    'title': "Charlotte's Web",
    'author': "E.B. White",
    'published': 1952
}

In [None]:
# Access a dict value by its key
my_dict['title']

In [4]:
# From a list of dictionaries - constructed row by row
# Each dict is one row; its keys become the column names.
list_of_dict = [
    {"name": "Ginger", "breed": "Dachshund", "height_cm": 22, "weight_kg": 10, "date_of_birth": "2019-03-14"},
    {"name": "Scout", "breed": "Dalmation", "height_cm": 59, "weight_kg": 25, "date_of_birth": "2019-05-09"},
]

new_dogs = pd.DataFrame(list_of_dict)

# From a dictionary of lists - constructed column by column
# Each key is a column name; each list holds that column's values from top to bottom.
dict_of_lists = {
    "name":          ["Ginger", "Scout"],
    "breed":         ["Dachshund", "Dalmation"],
    "height_cm":     [22, 59],
    "weight_kg":     [10, 25],
    "date_of_birth": ["2019-03-14", "2019-05-09"],
}

# Rebinds new_dogs; the data is the same as in the row-by-row frame built above.
new_dogs = pd.DataFrame(dict_of_lists)

In [5]:
# Show the DataFrame as plain text
print(new_dogs)

     name      breed  height_cm  weight_kg date_of_birth
0  Ginger  Dachshund         22         10    2019-03-14
1   Scout  Dalmation         59         25    2019-05-09


## Reading and Writing CSVs

In [None]:
# Load a DataFrame from a CSV file; this rebinds new_dogs.
# The path is relative, so new_dogs.csv has to be in the current working directory.
new_dogs = pd.read_csv("new_dogs.csv")

In [None]:
# Saving to CSV
# The ".csv" extension keeps the output recognisable as a CSV file and consistent
# with the read_csv call on "new_dogs.csv".
# to_csv writes the row index as the first column by default; pass index=False to leave it out.
# NOTE(review): no "bmi" column is created in the visible cells — confirm it is added before saving.
new_dogs.to_csv("new_dogs_with_bmi.csv")

In [None]:
# Look into:
# Joining Data with pandas
# Streamlined Data Ingestion with pandas