# Working with CSV files

In [1]:
# import necessary modules
import pandas as pd, numpy as np

The read_csv() function in pandas allows us to easily import our data. By default, it assumes the data is comma-delimited. However, you can specify the delimiter used in your data (e.g., tab, semicolon, pipe, etc.). There are several parameters that you can specify. See the documentation here.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

In [17]:
# Location of the cars dataset -- replace this with the path to the file on your machine.
cars_csv_path = r"C:\Users\itspark\Documents\Analytics\dataset/cars1.csv"
# NOTE: despite its name, phone_data holds the cars dataset (mpg, cylinders, ...).
phone_data = pd.read_csv(cars_csv_path)
phone_data.head()

Unnamed: 0.1,Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu,,,,,
1,1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320,,,,,
2,2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite,,,,,
3,3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst,,,,,
4,4,17.0,8,302,140,3449,10.5,70,1,ford torino,,,,,


In [22]:
# Write the DataFrame back out as comma-separated text.
# index=False keeps the row index out of the file; otherwise it is saved as an
# extra unnamed column and comes back as "Unnamed: 0" the next time the file is
# loaded with read_csv().
phone_data.to_csv("aaa.txt", index=False)

In [11]:
# read_csv() returns a DataFrame and takes the column labels from the first row of the file
cities_csv_path = r'C:\Users\itspark\Documents\Analytics\dataset/testt.csv'
df2 = pd.read_csv(cities_csv_path)
df2

Unnamed: 0,city,state
0,san francisco,california
1,phoenix,arizona
2,seattle,washington
3,dallas,texas
4,denver,colorado
5,chicago,illinois
6,portland,oregon
7,miami,florida


In [3]:
# .head() previews the first rows of a DataFrame and .tail() the last rows (5 rows unless told otherwise)
df2.head(n=5)

Unnamed: 0,city,state
0,san francisco,california
1,phoenix,arizona
2,seattle,washington
3,dallas,texas
4,denver,colorado


In [4]:
# pass n to choose how many rows come back instead of the default 5
df2.tail(n=3)

Unnamed: 0,city,state
5,chicago,illinois
6,portland,oregon
7,miami,florida


In [12]:
# you can add a new column to a DataFrame by assigning to a label that does not exist yet;
# the single value (an empty string here) is repeated for every row
df2['country'] = ''
df2

Unnamed: 0,city,state,country
0,san francisco,california,
1,phoenix,arizona,
2,seattle,washington,
3,dallas,texas,
4,denver,colorado,
5,chicago,illinois,
6,portland,oregon,
7,miami,florida,


In [13]:
# .dtypes lists the data type of every column (string columns are reported as object)
df2.dtypes

city       object
state      object
country    object
dtype: object

In [14]:
# you can update the values of an entire column: assigning a single value overwrites every row
df2['country'] = 'USA'
df2

Unnamed: 0,city,state,country
0,san francisco,california,USA
1,phoenix,arizona,USA
2,seattle,washington,USA
3,dallas,texas,USA
4,denver,colorado,USA
5,chicago,illinois,USA
6,portland,oregon,USA
7,miami,florida,USA


In [15]:
# a column (aka a Series) can also be filled from a list holding one value per row;
# the two labels repeated 4 times give the 8 values needed for the 8 rows
alternating_labels = ['USA', 'United States'] * 4
df2['country'] = alternating_labels
df2

Unnamed: 0,city,state,country
0,san francisco,california,USA
1,phoenix,arizona,United States
2,seattle,washington,USA
3,dallas,texas,United States
4,denver,colorado,USA
5,chicago,illinois,United States
6,portland,oregon,USA
7,miami,florida,United States


In [16]:
# Series (i.e. DataFrame columns) offer fast vectorized methods such as .replace()
df2['country'].replace(to_replace='United States', value='USA')

0    USA
1    USA
2    USA
3    USA
4    USA
5    USA
6    USA
7    USA
Name: country, dtype: object

In [11]:
# that didn't do anything to our dataframe, because .replace() returns the updated version - it doesn't perform the operation in place
df2

Unnamed: 0,city,state,country
0,san francisco,california,USA
1,phoenix,arizona,United States
2,seattle,washington,USA
3,dallas,texas,United States
4,denver,colorado,USA
5,chicago,illinois,United States
6,portland,oregon,USA
7,miami,florida,United States


In [12]:
# .columns holds the column labels of the DataFrame
df2.columns

Index(['city', 'state', 'country'], dtype='object')

In [10]:
# store the returned Series back into the column so that the replacement is kept
normalized_country = df2['country'].str.replace('United States', 'USA')
df2['country'] = normalized_country
df2

Unnamed: 0,city,state,country
0,san francisco,california,USA
1,phoenix,arizona,USA
2,seattle,washington,USA
3,dallas,texas,USA
4,denver,colorado,USA
5,chicago,illinois,USA
6,portland,oregon,USA
7,miami,florida,USA


In [11]:
# column names are changed by assigning a new list of labels (one per existing column) to .columns
renamed_columns = ['city_name', 'state_name', 'nation']
df2.columns = renamed_columns
df2

Unnamed: 0,city_name,state_name,nation
0,san francisco,california,USA
1,phoenix,arizona,USA
2,seattle,washington,USA
3,dallas,texas,USA
4,denver,colorado,USA
5,chicago,illinois,USA
6,portland,oregon,USA
7,miami,florida,USA


In [12]:
# you can save your DataFrame as a csv file (uncomment the line below and add your system path)
# df2.to_csv('data/my_data.csv')

# Dropping and editing values

In [23]:
# a DataFrame can be built in many ways; here each tuple becomes one row,
# and the two constant columns are appended in a single chained step
list_of_tuples = [
    ('sf', 2012),
    ('phx', np.nan),
    ('phx', 2005),
    ('chi', 2009),
]
df = (
    pd.DataFrame(list_of_tuples, columns=['city', 'year'])
    .assign(country='USA', continent='North America')
)
df

Unnamed: 0,city,year,country,continent
0,sf,2012.0,USA,North America
1,phx,,USA,North America
2,phx,2005.0,USA,North America
3,chi,2009.0,USA,North America


In [28]:
# .drop() removes rows or columns by label and returns a new DataFrame (df itself is left untouched)
# axis 0 / 'index'   = rows
# axis 1 / 'columns' = columns
df2 = df.drop(labels='country', axis='columns')
df2

Unnamed: 0,city,year,continent
0,sf,2012.0,North America
1,phx,,North America
2,phx,2005.0,North America
3,chi,2009.0,North America


In [19]:
# you can also remove a column from a dataframe with the del statement
# .copy() gives df3 its own data; pd.DataFrame(df2) does not copy, so df3 would
# share its underlying data with df2 while we delete a column from it
df3 = df2.copy()
del df3['continent']
df3

Unnamed: 0,city,year
0,sf,2012.0
1,phx,
2,phx,2005.0
3,chi,2009.0


In [20]:
# the number of rows is the length of the DataFrame, i.e. the length of its index
len(df3.index)

4

In [21]:
# you can also use the .count() method to check the row count, but it counts per column and
# excludes NaNs - so a column with missing values reports fewer than len() does
df3.count()

city    4
year    3
dtype: int64

In [22]:
# flag every missing (NaN) cell, then add the flags up to get the number of missing values per column
df3.isna().sum()

city    0
year    1
dtype: int64

In [18]:
# or you can use the .shape attribute to get the shape of the DataFrame as (rows, columns)
df3.shape

(4, 2)

An attribute is different from a method or a function: notice that it is accessed without parentheses, i.e. `df3.shape`, not `df3.shape()`.



In [19]:
# you can get a count of how many times each distinct value appears in some column
df3['city'].value_counts()

phx    2
sf     1
chi    1
Name: city, dtype: int64

In [23]:
# with no arguments, .drop_duplicates() only removes rows that match on every column;
# the two 'phx' rows differ in year, so all rows are kept here
df3.drop_duplicates()

Unnamed: 0,city,year
0,sf,2012.0
1,phx,
2,phx,2005.0
3,chi,2009.0


In [24]:
# restrict the duplicate check to one column with subset; the first row seen for each value is kept
df4 = df3.drop_duplicates(subset='city', keep='first')
df4

Unnamed: 0,city,year
0,sf,2012.0
1,phx,
3,chi,2009.0


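Look at the index above: the row labelled 2 was dropped, but the remaining rows keep their original labels (0, 1, 3). Remember that the index is not a row counter; it is an index of row labels.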

In [25]:
# back to our earlier dataframe - it still has the 'country' column, because .drop() returned a new DataFrame
df

Unnamed: 0,city,year,country,continent
0,sf,2012.0,USA,North America
1,phx,,USA,North America
2,phx,2005.0,USA,North America
3,chi,2009.0,USA,North America


In [22]:
# arithmetic is applied to a whole Series (aka a column of our DataFrame) at once; missing values stay missing
df['year5'] = df['year'].add(5)
df

Unnamed: 0,city,year,country,continent,year5
0,sf,2012.0,USA,North America,2017.0
1,phx,,USA,North America,
2,phx,2005.0,USA,North America,2010.0
3,chi,2009.0,USA,North America,2014.0
