# Import and clean restaurant data

In [4]:
# Switch into the data directory that holds restaurants.csv.
# The path is relative, so an unguarded setwd() fails when this cell is run a
# second time (the session is already inside raw-output). Only change
# directory when we are not there yet, then show where we ended up.
if (basename(getwd()) != "raw-output") {
    setwd("raw-output")
}
getwd()

### suppress warning messages

In [5]:
# Silence warnings for the session. options() hands back the previous
# settings, so the old warning level is captured in the same call and can be
# restored later from `current_warning`.
current_warning <- options(warn = -1)[["warn"]]

### reset warning to normal

In [None]:
# Put the warning level back to the value saved in `current_warning`
options(warn = current_warning)

### read sample data

In [37]:
# Peek at the first 10 records to check the delimiter and the column classes
# before loading the whole file. The file is semicolon-separated, has no
# header row, and quoting is disabled so quote characters inside fields are
# read literally.
# `nrows` is spelled out in full; the abbreviated `nrow` only worked through
# partial argument matching.
# NOTE: the object name `sample` masks base::sample() in this session.
sample <- read.csv("restaurants.csv", sep = ";", header = FALSE, nrows = 10, quote = "")
head(sample)
sapply(sample, class)

### read in data

In [6]:
# Load the full restaurant file: semicolon-separated, no header row, quoting
# disabled, strings kept as character. Column names are supplied here because
# the file does not carry them.
restaurants <- read.csv(
    file = "restaurants.csv",
    header = FALSE,
    sep = ";",
    quote = "",
    stringsAsFactors = FALSE,
    col.names = c(
        "restid",
        "address1", "address2", "address3",
        "city", "county", "state", "zip",
        "lon", "lat",
        "open", "tempclose", "reopen", "close"
    )
)
head(restaurants)
# number of distinct restaurant ids
sum(!duplicated(restaurants$restid))

restid,address1,address2,address3,city,county,state,zip,lon,lat,open,tempclose,reopen,close
36955,15311 ELLA BLVD,STOP N GO/NAT'L CONV STORE,,HOUSTON,,TX,77090,0.0,0,1993-05-20,0000-00-00,0000-00-00,1999-12-01
38887,8808 BEACH BLVD,WAL-MART,,JACKSONVILLE,,FL,32216,0.0,0,0000-00-00,0000-00-00,0000-00-00,0000-00-00
49973,11800 ATLANTIC AVE,,,LYNWOOD,,CA,90262,0.0,0,1966-09-15,0000-00-00,0000-00-00,1987-09-15
35069,601 12TH ST NW,,,WASHINGTON,DIST OF COLUMBIA,DC,20005,0.0,0,1991-03-06,0000-00-00,0000-00-00,1997-08-06
41877,5729 WEST FRIENDLY AVENUE,WILCO-GUILFORD COLLEGE,,GUILFORD COLLEGE,,NC,27410,0.0,0,1995-06-19,0000-00-00,0000-00-00,1998-03-26
31941,28050 FORD ROAD,**OFFSET TO BE #017105**,,GARDEN CITY,WAYNE,MI,48135,0.0,0,1969-07-31,0000-00-00,0000-00-00,1995-08-09


### house cleaning

In [None]:
# Show the class of every column as it was read from the file
sapply(restaurants, class)
# Coerce the id and the geo coordinates (columns 1, 9 and 10) to numeric
restaurants[c("restid", "lon", "lat")] <- lapply(
    restaurants[c("restid", "lon", "lat")],
    as.numeric
)
# sanity check: print rows 100 to 120
restaurants[seq(100, 120), ]

# Convert a vector of "YYYY-MM-DD" strings to Date.
#
# Args:
#   x: character vector of dates; the placeholder "0000-00-00" marks a
#      missing date.
# Returns:
#   A Date vector of the same length as `x`. Placeholders, NAs and strings
#   that do not parse as "%Y-%m-%d" become NA.
convert_to_date <- function(x) {
    # replace 0000-00-00 dates as NA (%in% never yields NA, so existing NAs
    # in x are left alone)
    x[x %in% "0000-00-00"] <- NA
    # An explicit format keeps parsing independent of which element happens to
    # come first; without it as.Date() guesses the format from the first
    # non-NA value and stops with an error if that one value is malformed.
    as.Date(x, format = "%Y-%m-%d")
}
# Turn the four date columns (positions 11 to 14) into Date objects; the
# helper is removed afterwards because nothing else in this cell uses it.
restaurants[c("open", "tempclose", "reopen", "close")] <- lapply(
    restaurants[c("open", "tempclose", "reopen", "close")],
    convert_to_date
)
rm(convert_to_date)
# check classes again
sapply(restaurants, class)
# Recode placeholder values as missing
is.na(restaurants$address1) <- which(restaurants$address1 == "N/A")
is.na(restaurants$city) <- which(restaurants$city == "N/A")
is.na(restaurants$zip) <- which(restaurants$zip == "")
# Sort by opening date, then by closing date (rows with NA dates sort last)
restaurants <- restaurants[with(restaurants, order(open, close)), ]

In [65]:
# Print rows 900-920, restricted to columns 1-2, 5 and 7-14, to eyeball the
# cleaned values
restaurants[900:920, c(1:2, 5, 7:14)] #sanity check

In [77]:
# Build a status label for every restaurant from its open/close dates:
#   "closed"  - a close date is recorded
#   "open"    - an open date is recorded but no close date
#   "planned" - neither an open nor a close date is recorded
# Any existing status column is dropped first so the new one is appended at
# the end of the data frame.
restaurants$status <- NULL
length(restaurants$restid)
restaurants$status <- ifelse(
    !is.na(restaurants$close),
    "closed",
    ifelse(!is.na(restaurants$open), "open", "planned")
)
table(restaurants$status)


 closed    open planned 
   9679    6954     347 

In [35]:
# clean up address data
# Rename the three address lines to street1..street3 and keep the raw ZIP
# text (which may be ZIP+4, e.g. "94102-3221") under the name `longzip`.
colnames(restaurants)[c(2:4, 8)] <- c("street1", "street2", "street3", "longzip")
# After the rename there is no `zip` column any more, so literal "NA" strings
# have to be cleaned on `longzip`; indexing the missing `zip` column here
# could not clean anything.
restaurants$longzip[restaurants$longzip %in% "NA"] <- NA
class(restaurants$longzip)
# First five characters of the ZIP, kept as text so leading zeros survive
zip5 <- substr(restaurants$longzip, 1, 5)
# `zip` stays numeric as before (values that are not numbers become NA)
restaurants$zip <- as.numeric(zip5)
# Blank out the text ZIP wherever the numeric conversion produced NA, so the
# address drops exactly the same ZIPs as the numeric column does
zip5[is.na(restaurants$zip)] <- NA
# Full address "street1, city, state zip"; the ZIP is pasted from the text
# version because the numeric one would print e.g. 02134 as 2134
restaurants$address <- paste(restaurants$street1, restaurants$city,
                             paste(restaurants$state, zip5), sep = ", ")
# A missing ZIP is pasted as the text " NA"; strip it from the end
restaurants$address <- sub(pattern = " NA$", replacement = "", restaurants$address)
rm(zip5)

In [55]:
# Move the core columns to the front in a fixed order. Any other column that
# already exists is kept after them instead of being dropped: `status` in
# particular is still needed by the later cell that tabulates open
# restaurants by state.
leading_cols <- c("restid", "address", "street1", "street2", "street3", "city", "county",
                  "state", "zip", "lon", "lat", "open", "tempclose", "reopen", "close", "longzip")
restaurants <- restaurants[, c(leading_cols, setdiff(names(restaurants), leading_cols))]
rm(leading_cols)

In [90]:
# Preview the first rows after the column reorder (the names() call is left
# disabled)
#names(restaurants)
head(restaurants)

Unnamed: 0,restid,address,street1,street2,street3,city,county,state,zip,lon,lat,open,tempclose,reopen,close,longzip,status,tempclose_time
3647,33220,"HAMMOND DRIVE, SANDY SPRINGS, GA",HAMMOND DRIVE,,,SANDY SPRINGS,,GA,,0,0,1900-01-01,,,1983-04-27,,closed,NA days
809,33782,"5325 MEMORIAL DRIVE, STONE MOUNTAIN, GA",5325 MEMORIAL DRIVE,,,STONE MOUNTAIN,,GA,,0,0,1900-01-01,,,1986-02-11,,closed,NA days
682,33730,"ST. RT. 28 & CINEMA DR., MILFORD, OH 45150",ST. RT. 28 & CINEMA DR.,,,MILFORD,,OH,45150.0,0,0,1900-01-01,,,1986-04-11,45150,closed,NA days
1922,33832,"2200 CLARK STREET, LONG BEACH, CA 90815",2200 CLARK STREET,,,LONG BEACH,,CA,90815.0,0,0,1900-01-01,,,1986-07-29,90815,closed,NA days
654,34420,"9TH & ALDER, PORTLAND, OR",9TH & ALDER,,,PORTLAND,,OR,,0,0,1900-01-01,,,1986-10-21,,closed,NA days
7143,34585,"536 GOLDEN GATE, SAN FRANCISCO, CA 94102",536 GOLDEN GATE,,,SAN FRANCISCO,SAN FRANCISCO,CA,94102.0,0,0,1900-01-01,,,1986-11-19,94102-3221,closed,NA days


In [74]:
# Count restaurants per value of `state` to spot blank or malformed codes
table(restaurants$state)


       AK   AL   AR   AZ   CA   CO   CT   CU   DC   DE   FL   GA   HI   IA   ID 
  67   45  420  185  435 2254  324  150    1   31   49 1072  601   34   91   67 
  IL   IN   KS   KY   LA   MA   MD   ME   MI   MN   MO   MS   MT   NC   ND   NE 
 629  483  201  308  260  218  232   52  571  215  576  137   37  463   21   97 
  NH   NJ   NM   NV   NY   OH   OK   OR   PA   RI   SC   SD   TN   TX   UT   VA 
  72  361  131  151  543  688  259  258  524   29  238   41  470 1561  119  378 
  VT   WA   WI   WV   WY 
  15  390  292   92   42 

In [63]:
# Strip leading and trailing whitespace from the state codes
restaurants$state <- trimws(restaurants$state, which = "both")

In [68]:
# Recode state values that are not postal abbreviations. The lookup maps each
# bad value to its replacement; a lone apostrophe becomes an empty string.
# NOTE(review): the long values look like district/county names that ended up
# in the state field - confirm against the raw data.
state_fixes <- c(
    "DIST OF COLUMBIA" = "DC",
    "LOS ANGELES" = "CA",
    "ALLEGANY" = "NY",
    "'" = "",
    "JEFFERSON" = "KY"
)
needs_fix <- restaurants$state %in% names(state_fixes)
restaurants$state[needs_fix] <- unname(state_fixes[restaurants$state[needs_fix]])
rm(state_fixes, needs_fix)

In [79]:
# Count restaurants per state, restricted to rows whose status is "open"
table(restaurants$state[restaurants$status=="open"])


     AK  AL  AR  AZ  CA  CO  CT  CU  DC  DE  FL  GA  HI  IA  ID  IL  IN  KS  KY 
 31  16 128 107 178 870 159  48   1   4  16 404 238  31  57  30 257 227  95 142 
 LA  MA  MD  ME  MI  MN  MO  MS  MT  NC  ND  NE  NH  NJ  NM  NV  NY  OH  OK  OR 
129  62  98  17 274  78 206  75  17 235  14  49  19  84  58  76 178 342 104 103 
 PA  RI  SC  SD  TN  TX  UT  VA  VT  WA  WI  WV  WY 
154  16  93  15 216 615  66 186   5 138 131  49  13 

In [87]:
# Length of each temporary closure in days (reopen date minus temporary-close
# date); NA when either date is missing
restaurants$tempclose_time <- difftime(restaurants$reopen, restaurants$tempclose,
                                       units = "days")

In [94]:
# Show the first rows that have a temporary-closure duration
head(restaurants[!is.na(restaurants$tempclose_time), ])

Unnamed: 0,restid,address,street1,street2,street3,city,county,state,zip,lon,lat,open,tempclose,reopen,close,longzip,status,tempclose_time
3725,31702,"6104 UNIVERSITY AVE, SAN DIEGO, CA 92115",6104 UNIVERSITY AVE,,,SAN DIEGO,SAN DIEGO,CA,92115,-117.064,32.7531,1965-12-06,2007-12-23,2008-04-12,,92115,open,111 days
1969,31716,"5980 HOLLISTER AVENUE, GOLETA, CA 93117",5980 HOLLISTER AVENUE,,,GOLETA,SANTA BARBARA,CA,93117,-119.83,34.436,1966-05-17,2008-08-12,2008-08-22,2011-11-30,93117,closed,10 days
12733,31730,"9019 N 19TH AVE., PHOENIX, AZ 85021",9019 N 19TH AVE.,,,PHOENIX,MARICOPA,AZ,85021,-112.099,33.5687,1966-06-02,2005-08-20,2006-03-01,,85021-2909,open,193 days
16302,31740,"2450 E INDIAN SCHOOL RD, PHOENIX, AZ 85016",2450 E INDIAN SCHOOL RD,,,PHOENIX,MARICOPA,AZ,85016,-112.029,33.4949,1966-09-09,2001-07-16,2001-09-01,,85016,open,47 days
2181,31759,"1300 W WHITTIER BLVD, LA HABRA, CA 90631",1300 W WHITTIER BLVD,,,LA HABRA,ORANGE,CA,90631,-117.961,33.9391,1967-01-09,2008-02-03,2008-03-19,,90631,open,45 days
2477,31776,"1113 E COLORADO ST, GLENDALE, CA 91205",1113 E COLORADO ST,,,GLENDALE,LOS ANGELES,CA,91205,-118.241,34.1427,1967-07-07,2009-02-02,2009-03-26,,91205,open,52 days


In [102]:
# How many restaurants have a recorded temporary-closure duration
sum(!is.na(restaurants$tempclose_time))
# Distribution of the closure length, as a plain number of days
summary(as.numeric(restaurants$tempclose_time))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   1.00   11.00   51.00   73.17  105.50 1501.00   15037 

In [83]:
# Save the cleaned table to restaurants-clean.csv in the current working
# directory, without a row-name column
write.csv(x=restaurants, file="restaurants-clean.csv", row.names=FALSE)

### clean up longitude and latitude data

In [15]:
# Five-number summary of longitude, used to spot implausible values
summary(restaurants$lon)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
 -159.38   -94.98   -82.50   -59.28     0.00 40213.00        5 

In [16]:
# Summary of the non-negative longitudes only. Rows where lon is NA are kept
# in the subset as NA because the comparison itself evaluates to NA.
summary(restaurants$lon[restaurants$lon>=0])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
    0.00     0.00     0.00    14.44     0.00 40213.00        5 

In [17]:
# Histogram of longitude (needs a working graphics device)
hist(restaurants$lon)

ERROR: Error in png(tf, width, height, "in", pointsize, bg, res, antialias = antialias): unable to start png() device


Plot with title "Histogram of restaurants$lon"