# Import and clean restaurant data

In [4]:
# Show the current working directory, then move into "raw-output".
# NOTE(review): the read.csv() calls further down use a bare "restaurants.csv",
# so they presumably rely on this directory change - confirm before removing it.
getwd()
setwd("raw-output")

### suppress warning messages

In [5]:
# Silence warnings (a negative `warn` suppresses them) and keep the previous
# level in `current_warning`; options() hands back the values it replaced.
current_warning <- options(warn = -1)[["warn"]]

### reset warning to normal

In [None]:
# Restore the warning level that was saved in `current_warning`.
options(warn = current_warning)

### read sample data

In [37]:
# Read only the first 10 records to inspect the raw layout and the column
# classes read.csv() infers. The file is semicolon-delimited, has no header row,
# and quoting is disabled so quote characters in fields are read literally.
# `nrows` is spelled out in full rather than relying on partial matching.
sample <- read.csv(
    "restaurants.csv",
    sep = ";",
    header = FALSE,
    nrows = 10,
    quote = ""
)
head(sample)
sapply(sample, class)

### read in data

In [6]:
# Import the full restaurant file. It is semicolon-delimited with no header
# row, so the column names are supplied here; quoting is disabled so quote
# characters inside fields are read literally, and strings stay character.
restaurants <- read.csv(
    file = "restaurants.csv",
    header = FALSE,
    sep = ";",
    quote = "",
    stringsAsFactors = FALSE,
    col.names = c(
        "restid",
        "address1", "address2", "address3",
        "city", "county", "state", "zip",
        "lon", "lat",
        "open", "tempclose", "reopen", "close"
    )
)
head(restaurants)
# Count the distinct restaurant ids in the import.
sum(!duplicated(restaurants$restid))

restid,address1,address2,address3,city,county,state,zip,lon,lat,open,tempclose,reopen,close
36955,15311 ELLA BLVD,STOP N GO/NAT'L CONV STORE,,HOUSTON,,TX,77090,0.0,0,1993-05-20,0000-00-00,0000-00-00,1999-12-01
38887,8808 BEACH BLVD,WAL-MART,,JACKSONVILLE,,FL,32216,0.0,0,0000-00-00,0000-00-00,0000-00-00,0000-00-00
49973,11800 ATLANTIC AVE,,,LYNWOOD,,CA,90262,0.0,0,1966-09-15,0000-00-00,0000-00-00,1987-09-15
35069,601 12TH ST NW,,,WASHINGTON,DIST OF COLUMBIA,DC,20005,0.0,0,1991-03-06,0000-00-00,0000-00-00,1997-08-06
41877,5729 WEST FRIENDLY AVENUE,WILCO-GUILFORD COLLEGE,,GUILFORD COLLEGE,,NC,27410,0.0,0,1995-06-19,0000-00-00,0000-00-00,1998-03-26
31941,28050 FORD ROAD,**OFFSET TO BE #017105**,,GARDEN CITY,WAYNE,MI,48135,0.0,0,1969-07-31,0000-00-00,0000-00-00,1995-08-09


### house cleaning

In [7]:
# Inspect the class of every column as imported.
sapply(restaurants, class)
# Coerce the id and both coordinate columns (columns 1, 9 and 10) to numeric.
restaurants[c("restid", "lon", "lat")] <-
    lapply(restaurants[c("restid", "lon", "lat")], as.numeric)
# Sanity check: eyeball a slice of rows after the conversion.
restaurants[100:120, ]

#' Convert date strings to `Date`, treating "0000-00-00" as missing
#'
#' @param x A vector of date strings in which "0000-00-00" marks a missing
#'   date.
#' @return The result of `as.Date()` on `x`, with every "0000-00-00" entry
#'   returned as `NA`.
convert_to_date <- function(x) {
    # Blank out the placeholder first so as.Date() never has to parse it.
    x[x == "0000-00-00"] <- NA
    as.Date(x)
}
# Parse the four date columns (open, tempclose, reopen, close; columns 11-14),
# then remove the helper from the workspace.
restaurants[c("open", "tempclose", "reopen", "close")] <-
    lapply(restaurants[c("open", "tempclose", "reopen", "close")], convert_to_date)
rm(convert_to_date)
# Confirm the column classes after the conversion.
sapply(restaurants, class)
# Recode placeholder values as missing: "N/A" addresses and cities, empty zips.
restaurants$address1[restaurants$address1 %in% "N/A"] <- NA
restaurants$city[restaurants$city %in% "N/A"] <- NA
restaurants$zip[restaurants$zip %in% ""] <- NA
# Sort by opening date, breaking ties by closing date.
restaurants <- restaurants[with(restaurants, order(open, close)), ]

Unnamed: 0,restid,address1,address2,address3,city,county,state,zip,lon,lat,open,tempclose,reopen,close
100,39163,300 CLEVELAND,,,MUSCATINE,MUSCATINE,IA,52761,-91.0295,41.4524,1999-03-03,2010-10-23,2010-11-19,0000-00-00
101,151667,650 COLUMBIA AVENUE,EXXON STATION,RAINBOW GAS GARDEN,CHAPIN,LEXINGTON,SC,29036,-81.3245,34.1761,2009-05-22,0000-00-00,0000-00-00,0000-00-00
102,150379,13350 DALLAS PARKWAY,GALLERIA DALLAS,LEVEL 3 FOOD PLACE,DALLAS,DALLAS,TX,75240,-96.8208,32.9294,2006-10-16,0000-00-00,0000-00-00,2013-05-29
103,109698,3140 FM 1960 W,,,HUMBLE,HARRIS,TX,77338,-95.3769,30.0195,2003-02-19,0000-00-00,0000-00-00,2007-06-01
104,34241,3800 NORTHLAKE BLVD,,,PALM BEACH GARDENS,PALM BEACH,FL,33410,-80.0947,26.8083,1987-11-02,2010-05-18,2010-05-21,0000-00-00
105,118438,1075 W 4TH ST,,,RENO,WASHOE,NV,89503-5103,-119.828,39.5266,2001-11-30,0000-00-00,0000-00-00,0000-00-00
106,150425,1850 MEMORIAL LANE,,,WOOD RIVER,MADISON,IL,62095,-90.0679,38.8669,2006-08-23,0000-00-00,0000-00-00,2012-02-21
107,109744,942 SOUTHERN BLVD,,,BRONX,BRONX,NY,10459,-73.8918,40.821,2000-02-01,0000-00-00,0000-00-00,2008-10-10
108,34287,4152 STATE RT 34,,,HURRICANE,PUTNAM,WV,25526,-81.9381,38.4528,1987-12-16,2012-06-29,2012-07-01,0000-00-00
109,43349,2075 BARNETT SHOALS ROAD,,,ATHENS,CLARKE,GA,30605,-83.3406,33.9233,1998-12-09,0000-00-00,0000-00-00,2014-06-24


In [8]:
# Sanity check: rows 900-920 of the sorted table, leaving out the secondary
# address lines and the county.
restaurants[
    900:920,
    c("restid", "address1", "city", "state", "zip", "lon", "lat",
      "open", "tempclose", "reopen", "close")
]

Unnamed: 0,restid,address1,city,state,zip,lon,lat,open,tempclose,reopen,close
4910,32019,1529 CACHE ROAD,LAWTON,OK,73501,0.0,0.0,1972-02-25,,,1992-09-08
13537,32009,2900 FAIRVIEW RD,COSTA MESA,CA,92626,-117.908,33.6768,1972-02-25,,,2013-02-28
5407,50081,15418 LASSEN STREET,MISSION HILLS,CA,91340,0.0,0.0,1972-02-26,,,1991-12-25
10829,32004,4130 BOULDER HWY.,LAS VEGAS,NV,89121,0.0,0.0,1972-05-03,,,1992-05-22
3832,44692,139 SOUTHWEST DRIVE,JONESBORO,AR,72401,-90.706,35.8202,1972-05-05,,,
14885,32016,923 S VAN BUREN,ENID,OK,73701,0.0,0.0,1972-05-07,,,1992-06-16
810,32034,913 EDGEBROOK BLVD.,HOUSTON,TX,77034,0.0,0.0,1972-05-26,,,1994-03-16
3889,46532,707 NORTH MINNESOTA,NEW ULM,MN,56073,-94.4662,44.3202,1972-06-01,2011-08-21,2012-05-07,
490,32018,13360 EUREKA ROAD,SOUTHGATE,MI,48195,0.0,0.0,1972-06-16,,,1992-10-18
638,32028,609 S MAIN ST,LOMBARD,IL,60148,0.0,0.0,1972-07-07,,,1990-10-14


In [9]:
# Classify every restaurant by its open/close dates:
#   "planned" - neither an open nor a close date
#   "open"    - an open date but no close date
#   "closed"  - a close date is present
nrow(restaurants)
restaurants$status <- rep("planned", nrow(restaurants))
restaurants$status[!is.na(restaurants$open)] <- "open"
# Applied last so that a close date overrides "open" and "planned".
restaurants$status[!is.na(restaurants$close)] <- "closed"
table(restaurants$status)


 closed    open planned 
   9679    6954     347 

In [86]:
#write.csv(x=restaurants, file="restaurants-clean.csv", row.names=FALSE)

In [10]:
# Preview the first rows of the table, now including the status column.
head(restaurants)

Unnamed: 0,restid,address1,address2,address3,city,county,state,zip,lon,lat,open,tempclose,reopen,close,status
3647,33220,HAMMOND DRIVE,,,SANDY SPRINGS,,GA,,0,0,1900-01-01,,,1983-04-27,closed
809,33782,5325 MEMORIAL DRIVE,,,STONE MOUNTAIN,,GA,,0,0,1900-01-01,,,1986-02-11,closed
682,33730,ST. RT. 28 & CINEMA DR.,,,MILFORD,,OH,45150,0,0,1900-01-01,,,1986-04-11,closed
1922,33832,2200 CLARK STREET,,,LONG BEACH,,CA,90815,0,0,1900-01-01,,,1986-07-29,closed
654,34420,9TH & ALDER,,,PORTLAND,,OR,,0,0,1900-01-01,,,1986-10-21,closed
7143,34585,536 GOLDEN GATE,,,SAN FRANCISCO,SAN FRANCISCO,CA,94102-3221,0,0,1900-01-01,,,1986-11-19,closed


### clean up longitude and latitude data

In [15]:
# Summarise the longitude column to check its range and missing values.
summary(restaurants$lon)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
 -159.38   -94.98   -82.50   -59.28     0.00 40213.00        5 

In [16]:
# Summarise only the non-negative longitudes; rows whose longitude is missing
# stay in the subset as NA and are counted under "NA's".
with(restaurants, summary(lon[lon >= 0]))

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
    0.00     0.00     0.00    14.44     0.00 40213.00        5 

In [17]:
# Histogram of the longitude values.
hist(restaurants$lon)

ERROR: Error in png(tf, width, height, "in", pointsize, bg, res, antialias = antialias): unable to start png() device


Plot with title "Histogram of restaurants$lon"