## Data Wrangling with Regular Expressions

***

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
sns.set_style('dark')
sns.set(font_scale=1.2)

plt.rc('axes', titlesize=9)
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)

import warnings
warnings.filterwarnings('ignore')

import re
import string

pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format','{:.2f}'.format)

random.seed(0)
np.random.seed(0)
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


## Exploratory Data Analysis

In [2]:
df = pd.read_csv("raw_bike_sharing_systems.csv")

In [3]:
df

Unnamed: 0,Country,City,Name,System,Operator,Launched,Discontinued,Stations,Bicycles,Daily ridership
0,Albania,Tirana[5],Ecovolis,,,March 2011,,8,200,
1,Argentina,Mendoza[6],Metrobici,,,2014,,2,40,
2,Argentina,"San Lorenzo, Santa Fe",Biciudad,Biciudad,,27 November 2016,,8,80,
3,Argentina,Buenos Aires[7][8],Ecobici,Serttel Brasil[9],Bike In Baires Consortium.[10],2010,,400,4000,21917
4,Argentina,Rosario,Mi Bici Tu Bici[11],,,2 December 2015,,47,480,
...,...,...,...,...,...,...,...,...,...,...
479,United States,"Santa Monica, California[315]",Breeze Bike Share,3 Gen. CycleHop and Social Bicycles,,13 August 2015,,80,500,
480,United States,"Savannah, Georgia[316]",CAT Bike,3 Gen. B-Cycle,,24 January 2014,,2,16,
481,United States,"Seattle, Washington[317]",Pronto Cycle Share,8D,Motivate,13 October 2014,31 March 2017,50,500,
482,United States,"Spartanburg, South Carolina[318]",Spartanburg B-Cycle,3 Gen. B-Cycle,,2011,,5,40,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484 entries, 0 to 483
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          484 non-null    object
 1   City             484 non-null    object
 2   Name             442 non-null    object
 3   System           341 non-null    object
 4   Operator         82 non-null     object
 5   Launched         453 non-null    object
 6   Discontinued     28 non-null     object
 7   Stations         408 non-null    object
 8   Bicycles         408 non-null    object
 9   Daily ridership  23 non-null     object
dtypes: object(10)
memory usage: 37.9+ KB


In [5]:
df.describe()

Unnamed: 0,Country,City,Name,System,Operator,Launched,Discontinued,Stations,Bicycles,Daily ridership
count,484,484,442,341,82,453,28,408,408,23
unique,57,444,325,92,43,248,24,168,189,23
top,Germany,Berlin,nextbike,3 Gen. nextbike,JCDecaux,2010,2016,dockless,500,150000 [100]
freq,63,9,24,61,16,30,3,21,18,1


In [6]:
df.columns

Index(['Country', 'City', 'Name', 'System', 'Operator', 'Launched', 'Discontinued', 'Stations', 'Bicycles', 'Daily ridership'], dtype='object')

In [7]:
df.columns = ['country', 'city', 'name', 'system', 'operator', 'launched', 'discontinued', 'stations', 'bicycles', 'daily_ridership']

In [8]:
df

Unnamed: 0,country,city,name,system,operator,launched,discontinued,stations,bicycles,daily_ridership
0,Albania,Tirana[5],Ecovolis,,,March 2011,,8,200,
1,Argentina,Mendoza[6],Metrobici,,,2014,,2,40,
2,Argentina,"San Lorenzo, Santa Fe",Biciudad,Biciudad,,27 November 2016,,8,80,
3,Argentina,Buenos Aires[7][8],Ecobici,Serttel Brasil[9],Bike In Baires Consortium.[10],2010,,400,4000,21917
4,Argentina,Rosario,Mi Bici Tu Bici[11],,,2 December 2015,,47,480,
...,...,...,...,...,...,...,...,...,...,...
479,United States,"Santa Monica, California[315]",Breeze Bike Share,3 Gen. CycleHop and Social Bicycles,,13 August 2015,,80,500,
480,United States,"Savannah, Georgia[316]",CAT Bike,3 Gen. B-Cycle,,24 January 2014,,2,16,
481,United States,"Seattle, Washington[317]",Pronto Cycle Share,8D,Motivate,13 October 2014,31 March 2017,50,500,
482,United States,"Spartanburg, South Carolina[318]",Spartanburg B-Cycle,3 Gen. B-Cycle,,2011,,5,40,


### Replacing values

In [9]:
df['city'] = df['city'].str.replace(r"[^a-zA-Z ]","")
df['city']

0                          Tirana
1                         Mendoza
2            San Lorenzo Santa Fe
3                    Buenos Aires
4                         Rosario
                  ...            
479       Santa Monica California
480              Savannah Georgia
481            Seattle Washington
482    Spartanburg South Carolina
483                       St Paul
Name: city, Length: 484, dtype: object

In [10]:
df['name'] = df['name'].str.replace(r"[^a-zA-Z ]","")
df['name']

0                 Ecovolis
1                Metrobici
2                 Biciudad
3                  Ecobici
4          Mi Bici Tu Bici
              ...         
479      Breeze Bike Share
480               CAT Bike
481     Pronto Cycle Share
482     Spartanburg BCycle
483    Yellow Bike Project
Name: name, Length: 484, dtype: object

In [11]:
df['system'] = df['system'].str.replace(r"[^a-zA-Z ]","")
df['system']

0                                    NaN
1                                    NaN
2                               Biciudad
3                         Serttel Brasil
4                                    NaN
                     ...                
479     Gen CycleHop and Social Bicycles
480                           Gen BCycle
481                                    D
482                           Gen BCycle
483                       Gen w BikeCard
Name: system, Length: 484, dtype: object

In [12]:
df['operator'] = df['operator'].str.replace(r"[^a-zA-Z ]","")
df['operator']

0                              NaN
1                              NaN
2                              NaN
3        Bike In Baires Consortium
4                              NaN
                  ...             
479                            NaN
480                            NaN
481                       Motivate
482                            NaN
483    volunteers and city council
Name: operator, Length: 484, dtype: object

In [13]:
df['launched'] = df['launched'].str.replace(r"[^a-zA-Z ]","")
df['launched']

0          March 
1                
2       November 
3                
4       December 
          ...    
479       August 
480      January 
481      October 
482              
483              
Name: launched, Length: 484, dtype: object

In [14]:
df['discontinued'] = df['discontinued'].str.replace(r"[^a-zA-Z ]","")
df['discontinued']

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
479        NaN
480        NaN
481     March 
482        NaN
483        NaN
Name: discontinued, Length: 484, dtype: object

In [15]:
df['stations'] = df['stations'].str.replace(r"[^a-zA-Z0-9 ]","")
df['stations']

0        8
1        2
2        8
3      400
4       47
      ... 
479     80
480      2
481     50
482      5
483    NaN
Name: stations, Length: 484, dtype: object

In [16]:
df['bicycles'] = df['bicycles'].str.replace(r"[^a-zA-Z0-9 ]","")
df['bicycles']

0       200
1        40
2        80
3      4000
4       480
       ... 
479     500
480      16
481     500
482      40
483     NaN
Name: bicycles, Length: 484, dtype: object

In [17]:
df['daily_ridership'] = df['daily_ridership'].str.replace(r"[^a-zA-Z0-9 ]","")
df['daily_ridership']

0        NaN
1        NaN
2        NaN
3      21917
4        NaN
       ...  
479      NaN
480      NaN
481      NaN
482      NaN
483      NaN
Name: daily_ridership, Length: 484, dtype: object

In [18]:
df

Unnamed: 0,country,city,name,system,operator,launched,discontinued,stations,bicycles,daily_ridership
0,Albania,Tirana,Ecovolis,,,March,,8,200,
1,Argentina,Mendoza,Metrobici,,,,,2,40,
2,Argentina,San Lorenzo Santa Fe,Biciudad,Biciudad,,November,,8,80,
3,Argentina,Buenos Aires,Ecobici,Serttel Brasil,Bike In Baires Consortium,,,400,4000,21917
4,Argentina,Rosario,Mi Bici Tu Bici,,,December,,47,480,
...,...,...,...,...,...,...,...,...,...,...
479,United States,Santa Monica California,Breeze Bike Share,Gen CycleHop and Social Bicycles,,August,,80,500,
480,United States,Savannah Georgia,CAT Bike,Gen BCycle,,January,,2,16,
481,United States,Seattle Washington,Pronto Cycle Share,D,Motivate,October,March,50,500,
482,United States,Spartanburg South Carolina,Spartanburg BCycle,Gen BCycle,,,,5,40,


### Save to CSV

In [19]:
df.to_csv("bikesharing.csv", index=False)

#### Python code done by Dennis Lam