<h1>Feature Engineering</h1>

<h4>Importing dependencies</h4>

In [10]:
import pandas as pd
import numpy as np

<h4>Analyzing each City</h4>

In [2]:
# Load the venues table; the first CSV column is used as the DataFrame index.
# The cells below read its 'Venue Category' and 'City' columns.
df = pd.read_csv(
    'europe_venues.csv',
    index_col=0,
)

In [5]:
# one-hot encode the venue categories: one indicator column per category.
# The dtype is pinned to uint8 so the indicators are 0/1 on every pandas
# version (pandas >= 2.0 would otherwise default to booleans).
europe_onehot = pd.get_dummies(
    df[['Venue Category']], prefix="", prefix_sep="", dtype=np.uint8
)

# add the city column back to the dataframe
europe_onehot['City'] = df['City']

# move the city column to the first position. It is selected by name instead of
# assuming it is the last column: if a venue category were itself called 'City',
# the assignment above would overwrite that column in place and the last column
# would be some unrelated category.
fixed_columns = ['City'] + [col for col in europe_onehot.columns if col != 'City']
europe_onehot = europe_onehot[fixed_columns]

europe_onehot.sample(10)

Unnamed: 0,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,...,West-Ukrainian Restaurant,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
12248,Toulouse,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42573,Nijmegen,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16070,Kaluga,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85,Moscow,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9275,Tyumen,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6680,Ulyanovsk,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,Kharkiv,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13869,Makiyivka,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4622,Voronezh,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10775,Gomel,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<h4>Grouping rows by city and taking the mean frequency of occurrence of each category</h4>

In [6]:
# Collapse the venue-level indicators to one row per city: each category column
# becomes its mean within the city group, and reset_index() turns 'City' back
# into an ordinary column.
europe_grouped = (
    europe_onehot
    .groupby('City')
    .mean()
    .reset_index()
)
europe_grouped.sample(10)

Unnamed: 0,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,...,West-Ukrainian Restaurant,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
173,Iasi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289,Neue Neustadt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
181,Kaliningrad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
439,Tomsk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
97,City of Westminster,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0
390,Sector 5,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,Oviedo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.01,0.0
62,Bratislava,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0
206,Korolyov,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
414,Stuttgart,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# (number of rows, number of columns) of the per-city table built above
europe_grouped.shape

(499, 562)

In [8]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first element.

    Parameters
    ----------
    row : pandas.Series
        The element at position 0 is skipped; the remaining entries are
        ranked by value in descending order.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries, highest value first, truncated
        to at most ``num_top_venues`` items.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values[:num_top_venues]
    return top_labels

<h4>Create a new dataframe with the top 10 venues for each city</h4>

In [13]:
num_top_venues = 10

# ordinal suffixes for ranks ending in 1, 2 and 3; every other rank takes 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for rank in range(1, num_top_venues + 1):
    # 11, 12 and 13 (and 111, 112, ...) are irregular and take 'th'
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# collect the top venues of every city first, then build the dataframe in one
# go instead of filling an empty frame row by row
top_venues = [
    list(return_most_common_venues(europe_grouped.iloc[ind, :], num_top_venues))
    for ind in range(europe_grouped.shape[0])
]
cities_venues_sorted = pd.DataFrame(
    top_venues, columns=columns[1:], index=europe_grouped.index
)
# 'City' goes first so the column order matches `columns`
cities_venues_sorted.insert(0, 'City', europe_grouped['City'])

cities_venues_sorted.sample(10)

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
93,Chernihiv,Coffee Shop,Park,Italian Restaurant,Supermarket,Gym / Fitness Center,Historic Site,Pub,Pizza Place,Hotel,Church
94,Chernivtsi,Café,Coffee Shop,Gym,Restaurant,Plaza,Gym / Fitness Center,Park,Fast Food Restaurant,Pizza Place,Hotel
2,Abakan,Park,Café,Mobile Phone Shop,Coffee Shop,Burger Joint,Theme Park,Sushi Restaurant,Pizza Place,Hotel,Grocery Store
214,Krefeld,Supermarket,Café,Ice Cream Shop,Italian Restaurant,Drugstore,German Restaurant,Greek Restaurant,Spanish Restaurant,Chinese Restaurant,Gym
24,Augsburg,Italian Restaurant,Café,Beer Garden,Steakhouse,German Restaurant,Bar,Turkish Restaurant,Bakery,Hotel,Burger Joint
331,Pecs,Coffee Shop,Bar,Café,Restaurant,Plaza,Scenic Lookout,Fast Food Restaurant,Bakery,Park,Beer Garden
288,Nazran',Café,Eastern European Restaurant,Monument / Landmark,Hotel,Train Station,Farm,Farmers Market,Fast Food Restaurant,Falafel Restaurant,Fishing Spot
151,Goeteborg,Coffee Shop,Hotel,Burger Joint,Café,Scandinavian Restaurant,Pub,Theme Park Ride / Attraction,Wine Bar,Italian Restaurant,Seafood Restaurant
418,Sutton,Pub,Park,Coffee Shop,Café,Supermarket,Sushi Restaurant,Korean Restaurant,Italian Restaurant,Tennis Stadium,Gastropub
434,Tilburg,Bar,Café,Theme Park Ride / Attraction,Restaurant,Park,Supermarket,Zoo Exhibit,Coffee Shop,Electronics Store,Shopping Mall
