# Pandas Examples

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1) Accessing DataFrames and Series

In [None]:


# Input table: 13 samples (rows) x 4 attributes (columns).
# np.nan marks a missing measurement.
X = np.array([
    [5.1, 3.5,    1.0, 0.2],
    [4.3, 3.0,    1.0, 0.1],
    [5.0, np.nan, 1.0, 0.4],
    [5.1, 3.4,    2.0, 0.2],
    [7.0, 3.2,    1.0, 0.2],
    [6.9, 3.1,    3.0, 1.5],
    [6.7, 3.1,    1.0, np.nan],
    [6.0, 2.9,    2.0, 1.5],
    [6.1, 3.0,    2.0, 1.4],
    [6.5, 3.0,    3.0, 2.2],
    [7.7, 3.8,    3.0, 2.2],
    [7.4, 2.8,    1.0, 1.9],
    [6.8, 3.2,    1.0, 2.3],
])

# Attribute name of each column of X, in order
columns = [
    'height',
    'width',
    'intensity',
    'weight',
]

# Class label of each sample (one entry per row of X): five 0s, four 1s, four 2s
labels = np.array([0] * 5 + [1] * 4 + [2] * 4)

**a) Create a DataFrame from a Numpy array (X)**
- Set up the column names and the class label
- Add a new composite feature, 'area' = 'width' * 'height'
- Fix missing values with the forward-fill method

**b) Compute the average area of samples with intensity greater than 1:**

**c) Compute the following probability:**

$$P(label=2 \ | \ height<7)$$   

**d) Compute the following probability for each class label $l$, using the Naive Bayes technique.**
$$P(label=l \ | \ intensity=2 \land height<6.5)$$

**e) Normalize the columns with scikit-learn's `StandardScaler`, then fit a random forest**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


## 2) Working with Pandas and spatial data
**a) Load dataframe with Points Of Interest (POIs)**

In [None]:
# Explicit data type for each attribute (column) of the POI table
d_types = {
    '@type': str,
    '@lat': float,
    '@lon': float,
    'amenity': str,
    'name': str,
    'shop': str,
    'public_transport': str,
    'highway': str,
}
# Load the tab-separated file "pois_all_info.csv", using the '@id' column as the index
all_pois_df = pd.read_csv(
    "pois_all_info.csv",
    sep='\t',
    index_col='@id',
    dtype=d_types,
)

**b) Plot the POIs on the New York map**

In [None]:
def plot_map(pois_df):
    """Scatter-plot Points Of Interest on top of the New York City map image.

    Parameters
    ----------
    pois_df : pandas.DataFrame
        Table of POIs. Must contain the '@lon' and '@lat' columns, which are
        used as the x and y coordinates of each point.
    """
    fig, ax = plt.subplots(figsize=(12,8))
    nyc_img = plt.imread('./New_York_City_Map.PNG')
    # extent is [left, right, bottom, top] in data coordinates, so the image is
    # stretched over the longitude (x) / latitude (y) box the points live in.
    ax.imshow(nyc_img, zorder=0, extent=[-74.258, -73.7, 40.49,40.92])
    # x = longitude, y = latitude, to match the axes defined by `extent` above
    ax.scatter(pois_df['@lon'], pois_df['@lat'], s=1)
    ax.set_xlabel('longitude')
    ax.set_ylabel('latitude')
    plt.show()

plot_map(all_pois_df)

**c) Keep only the POIs in the NY municipality**
- Plot the result on a map

In [None]:
# Load "ny_municipality_pois_id.csv" (no header row) and keep its first column
# (label 0) as a Series of POI ids
ny_pois_ids = pd.read_csv(
    "ny_municipality_pois_id.csv",
    header=None,
).loc[:, 0]

In [None]:
# Keep only the rows of all_pois_df whose index value ('@id') appears in ny_pois_ids.
# isin() builds a boolean mask, so ids that are absent from the table are simply
# ignored instead of raising a KeyError.
pois_df = all_pois_df.loc[all_pois_df.index.isin(ny_pois_ids)]

In [None]:
# Draw the POIs held in pois_df on the city map
# (pois_df is expected to be the subset restricted to the NY municipality)
plot_map(pois_df)

**d) Count the most frequent public_transport types**

In [None]:
# Number of POIs per public_transport type, most frequent first.
# value_counts() sorts in descending order and skips missing values by default.
top_freq = pois_df['public_transport'].value_counts()
top_freq

**e) Plot POIs with the two most frequent public transport types**

In [None]:
## Plot POIs on the map
fig, ax = plt.subplots(figsize=(12,8))
nyc_img = plt.imread('./New_York_City_Map.PNG')
ax.imshow(nyc_img, zorder=0, extent=[-74.258, -73.7, 40.49,40.92])

# The two most frequent public_transport types: value_counts() sorts in
# descending order (missing values excluded), so they are the first two labels.
top_two_types = pois_df['public_transport'].value_counts().index[:2]

# One scatter call per type so that each gets its own colour and legend entry
for pt_type in top_two_types:
    type_df = pois_df[pois_df['public_transport'] == pt_type]
    ax.scatter(type_df['@lon'], type_df['@lat'], s=1, label=pt_type)

ax.set_xlabel('longitude')
ax.set_ylabel('latitude')

# Only draw a legend when something was plotted; the points use s=1, so the
# legend markers are enlarged to stay readable.
if len(top_two_types) > 0:
    ax.legend(markerscale=6)
plt.show()