# Pandas: exploring, loading and saving data
- selection and slicing: selecting columns and rows
- adding rows and columns
- understanding the data 
- selecting rows based on criteria

In [None]:
# Import pandas library
import pandas as pd
import os

## Two ways of creating the same DataFrame

### Creating a dictionary containing employee data and defining a DataFrame from it

In [None]:
# Employee data laid out column-wise: every keyword becomes a column name and
# its list holds that column's values, one entry per employee.
data = dict(
    Num=[1, 2, 3, 4, 5, 6],
    Name=["Trevor", "Princi", "Peter", "Gaurav", "Anuj", "Peter"],
    Age=[27, 24, 50, 22, 32, 25],
    City=["London", "Paris", "Lisbon", "Paris", "Paris", "Madrid"],
    Degree=["MSc", "MA", "PhD", "MCA", "Phd", "MSc"],
)
data

In [None]:
# Build the DataFrame from the dictionary: keys become column names and
# each list becomes the values of that column.
df = pd.DataFrame(data)
df

### Creating a DataFrame from a list of records

In [None]:
# The same employees, this time row-wise: one inner list per record.
# Column names are not part of the records, so they are passed separately.
data = [
    [1, "Trevor", 27, "London", "MSc"],
    [2, "Princi", 24, "Paris", "MA"],
    [3, "Peter", 50, "Lisbon", "PhD"],
    [4, "Gaurav", 22, "Paris", "MCA"],
    [5, "Anuj", 32, "Paris", "Phd"],
    [6, "Peter", 25, "Madrid", "MSc"],
]
df = pd.DataFrame(
    data=data,
    columns=["Num", "Name", "Age", "City", "Degree"],
)
df

Actually, we could create the same DataFrame in many other ways ...

In [None]:
# Promote the "Num" column to the row index. The result is assigned back to
# df, so from here on rows are labelled by their "Num" value.
df = df.set_index("Num")
df

## Accessing elements

selecting columns ...

In [None]:
# A single column label inside [] selects that one column
df["Name"]

In [None]:
# A list of labels inside [] selects several columns at once (here: two)
df[['Name', 'Degree']]

### Indexers: iloc, loc
When selecting rows, slicing and indexing conventions can be a source of confusion...
- iloc: implicit integer index (positions in a vector)
- loc : explicit index (uses the actual index labels, which need not be integers; a slice includes its end label)

In [None]:
# Show the current frame for reference before comparing the indexers
df

In [None]:
# implicit index when slicing: positions 1 and 2 (the end position is excluded)
df.iloc[1:3]

In [None]:
# explicit index when slicing: index labels 1 through 3 (the end label is included)
df.loc[1:3]

In [None]:
# Will it use implicit or explicit indexing?
# (plain [] with a slice and no indexer: compare the result with the two cells above)
df[1:3]

In [None]:
# implicit integer index: the row at position 1, i.e. the second row
df.iloc[1]

In [None]:
# explicit index when indexing: the row whose index label is 1
df.loc[1]

Two ways of getting the same value...

In [None]:
# Two steps: select the row labelled 4, then pick "Age" from that row
df.loc[4]["Age"]

In [None]:
# One step: row label and column label passed to loc together
df.loc[4, "Age"]

## Adding a new column

In [None]:
# Assigning a list to a new column label adds that column; the list holds
# one value per existing row, in row order.
# NOTE(review): the first row's city is London but its country code here is
# "IN" - confirm this is intended.
df["Country"] = ["IN","FR","PT","FR","FR","SP"]
df["Score"] = [18, 16, 19, 20, 16, 18]
df

## Adding rows

In [None]:
# Assigning to a label that is not yet in the index adds a new row.
# Values follow the column order: Name, Age, City, Degree, Country, Score.
df.loc["Mike"] = ["Michael", 33, "London", "MA", "UK", 15]
df

## Check if a value exists

In [None]:
# Membership test against the row labels
'Mike' in df.index

In [None]:
# How is this possible ?
# (hint: `in` applied to a Series looks at its index labels, not at its values)
'Mike' in df["City"]

In [None]:
# .values exposes the underlying array, so this tests the city names themselves
'Mike' in df["City"].values

## Changing Rows

In [None]:
# Update a single cell: row label "Mike", column "Age".
# Note that every re-run of this cell adds one more year.
df.loc["Mike", "Age"] += 1
df

In [None]:
# Assigning a list to an existing label overwrites that whole row
# (values in column order: Name, Age, City, Degree, Country, Score).
df.loc[1] = ["Pierre", 21, "Avignon", "MA", "BR", 14]
df

In [None]:
# Note that the following won't work as expected: df.iloc[1] extracts the row
# as a separate Series, and the += changes only that extracted Series.
# To change the frame itself, give row and column to a single indexer call,
# as in the df.loc["Mike", "Age"] cell above.
df.iloc[1]["Age"] += 1
df

## Understanding your data

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# First 3 rows
df.head(3)

In [None]:
# Last rows (5 by default)
df.tail()

In [None]:
# Count how many rows share each (City, Country) combination
df[["City","Country"]].value_counts()

In [None]:
import matplotlib.pyplot as plt
#plt.rcParams.update({'font.size': 20, 'figure.figsize': (10, 8)}) # set font and plot size to be larger

In [None]:
# Scatter plot of Score against Age; the trailing ; hides the text repr of
# the returned plot object.
df.plot(kind='scatter', x='Age', y='Score', title='Age vs Score');

In [None]:
# Histogram of the Score column
df['Score'].plot(kind='hist', title='Score');
# Other plot kinds to try on the same column:
#df['Score'].plot(kind='line', title='Score');
#df['Score'].plot(kind='bar', title='Score');

In [None]:
# Box plot of the Score column
df['Score'].plot(kind="box");

## Selecting data based on criteria

In [None]:
# Element-wise comparison: one boolean per row, True where City is "Paris"
df["City"] == "Paris"

In [None]:
# Use the boolean result as a mask inside [] to keep only the matching rows
df[df["City"] == "Paris"]

In [None]:
# Mean age of the people living in Paris: the row mask and the column label
# are handed to loc in a single call.
df.loc[df["City"] == "Paris", "Age"].mean()

In [None]:
# Combine two conditions with | (element-wise OR); each comparison is
# wrapped in its own parentheses.
(df["City"] == "Paris") | (df["City"] == "London") 

In [None]:
# Keep the rows where City is either "Paris" or "London"
df[ (df["City"] == "Paris") | (df["City"] == "London") ]

In [None]:
# Same selection as the previous cell, written with isin and a list of accepted values
df[ df["City"].isin(["Paris", "London"]) ]

In [None]:
# Combine conditions with & (element-wise AND): city in the list and age above 25
df[ (df["City"].isin(["Paris", "London"])) & ( df["Age"] > 25 ) ]

# Reading data from a file and writing data to a file

### Writing to CSV, JSON, Excel and SQL files

If you are using Google Colab, the following code can be used to access folders in your Google Drive:

In [None]:
# Only when running inside Google Colab: mount Google Drive and make MyDrive
# the working directory, so that the relative 'data/...' paths used in the
# following cells resolve inside the Drive.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive')
    #glob.glob('*')

In [None]:
# Make sure the output folder exists before any file is written into it.
# exist_ok=True turns the call into a no-op when the folder is already there,
# so no separate existence check is needed and the cell can be re-run safely.
os.makedirs('data', exist_ok=True)

It’s quite simple to save and load data from various file formats into a DataFrame.

In [None]:
# Save the frame as a CSV file inside the data folder
df.to_csv('data/people.csv')
#files.download('data/people.csv')

In [None]:
# Save the same frame as a JSON file inside the data folder
df.to_json('data/people.json')

### Reading from CSV files

In [None]:
# Read the CSV back without naming an index column: the saved index comes
# back as an ordinary column and the rows get a fresh default index.
df2 = pd.read_csv('data/people.csv')
df2

CSVs don't have indexes like our DataFrames, so all we need to do is just designate the `index_col` when reading:

In [None]:
# index_col="Num" turns the "Num" column back into the row index while reading
df2 = pd.read_csv('data/people.csv', index_col="Num")
df2

If you have a JSON file — which is essentially a stored Python `dict` — pandas can read this just as easily:

### Reading data from JSON

If you have a JSON file — which is essentially a stored Python `dict` — pandas can read this just as easily:

In [None]:
# Load the JSON file written earlier into a DataFrame
df2 = pd.read_json('data/people.json')
df2

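Unlike `read_csv`, `read_json` has no `index_col` parameter. With the default layout the row labels are stored in the JSON file and come back as the index automatically; only the name of the index is not stored in the file.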

In [None]:
# The JSON file was written with the default layout, which stores the row
# labels, so read_json has already restored them as the index. There is no
# "index" column to promote: set_index('index') would raise a KeyError.
# Only the index name ("Num") is missing from the file, so put it back.
df2 = df2.rename_axis('Num')
df2

### Reading and writing Excel files

Warning: the openpyxl library is required. If it is missing, you can install it with `pip install openpyxl` (or `%pip install openpyxl` from a notebook cell).

In [None]:
# Save the frame as an Excel workbook inside the data folder
df.to_excel('data/people.xlsx')

In [None]:
# index_col=0 uses the first column of the sheet as the row index
df2 = pd.read_excel('data/people.xlsx', index_col=0)
df2

### Reading and writing to a SQL database

If you’re working with data from a SQL database you need to first establish a connection using an appropriate Python library, then pass a query to pandas. Here we'll use SQLite to demonstrate. 

In [None]:
import sqlite3

In [None]:
# Write the frame to a table named "people" in a local SQLite database file.
con = sqlite3.connect("data/people.sqlite3")
try:
    # if_exists='replace' lets this cell be re-run: with the default ('fail')
    # to_sql raises a ValueError once the table exists from an earlier run.
    df.to_sql('people', con, if_exists='replace')
finally:
    # Always release the database file, even if the write fails.
    con.close()

In [None]:
import sqlite3

# Open a connection to the SQLite file, run the query and load the result
# into a DataFrame.
con = sqlite3.connect("data/people.sqlite3")
try:
    # The former index was saved as an ordinary column, so it comes back as
    # data. The index was named "Num" when it was set earlier in the notebook,
    # so passing index_col='Num' here would restore it as the index.
    df2 = pd.read_sql_query("SELECT * FROM people", con)
finally:
    # Close the connection once the data is in memory.
    con.close()
df2