In [4]:
import pandas as pd
import numpy as np

# Creating dataframes

* From dictionary: https://www.geeksforgeeks.org/how-to-create-dataframe-from-dictionary-in-python-pandas/
* From list: https://www.geeksforgeeks.org/create-a-pandas-dataframe-from-lists/

In [5]:
# from a dictionary

d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [6]:
# from numpy array

df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                   columns=['a', 'b', 'c'])
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [7]:
# from list

lst = ['Geeks', 'For', 'Geeks', 'is', 
            'portal', 'for', 'Geeks']
  
# Calling DataFrame constructor on list
df = pd.DataFrame(lst)
df

Unnamed: 0,0
0,Geeks
1,For
2,Geeks
3,is
4,portal
5,for
6,Geeks


# Opening files

* CSV: https://pythonbasics.org/read-csv-with-pandas/
* Excel https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
* JSON https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html

In [1]:
# Selecting files

import tkinter as tk
from tkinter import filedialog

root = tk.Tk()
root.lift()
root.withdraw()

print('Opening dialogue box for file selection. Please choose a file.')

file_path = filedialog.askopenfilename()

print('File selected:', file_path)

Opening dialogue box for file selection. Please choose a file.
File selected: C:/Users/cimacint/Documents/Junk/PandasDemo/WebColors.csv


In [3]:
# create a dataframe from a csv file

# Load pandas
import pandas as pd

# Read CSV file into DataFrame df
# Try this with the index_col removed also
df = pd.read_csv(file_path, index_col=0)

# Show dataframe
print(df)

       name      hex
id                  
1     White  #FFFFFF
2    Silver  #C0C0C0
3      Gray  #808080
4     Black  #000000
5       Red  #FF0000
6    Maroon  #800000
7    Yellow  #FFFF00
8     Olive  #808000
9      Lime  #00FF00
10    Green  #008000
11     Aqua  #00FFFF
12     Teal  #008080
13     Blue  #0000FF
14     Navy  #000080
15  Fuchsia  #FF00FF
16   Purple  #800080


In [4]:
# Selecting files

import tkinter as tk
from tkinter import filedialog

root = tk.Tk()
root.lift()
root.withdraw()

print('Opening dialogue box for file selection. Please choose a file.')

file_path = filedialog.askopenfilename()

print('File selected:', file_path)

Opening dialogue box for file selection. Please choose a file.
File selected: C:/Users/cimacint/Documents/Junk/PandasDemo/WebColors.xlsx


In [7]:
# create a dataframe from an Excel file

# Load pandas
import pandas as pd

# Read CSV file into DataFrame df
# Try this with the index_col removed also
df = pd.read_excel(file_path, index_col=0)

# Show dataframe
print(df)

       name      hex
id                  
1     White  #FFFFFF
2    Silver  #C0C0C0
3      Gray  #808080
4     Black  #000000
5       Red  #FF0000
6    Maroon  #800000
7    Yellow  #FFFF00
8     Olive  #808000
9      Lime  #00FF00
10    Green  #008000
11     Aqua  #00FFFF
12     Teal  #008080
13     Blue  #0000FF
14     Navy  #000080
15  Fuchsia  #FF00FF
16   Purple  #800080


In [14]:
# create json file from the dataframe

import os
  
# this will return a tuple of root and extension
split_tup = os.path.splitext(file_path)
print(split_tup)

df.to_json(str(split_tup[0]) + '.json', orient='index')

print('Done.')

('C:/Users/cimacint/Documents/Junk/PandasDemo/WebColors', '.xlsx')
Done.


In [15]:
# Selecting files

import tkinter as tk
from tkinter import filedialog

root = tk.Tk()
root.lift()
root.withdraw()

print('Opening dialogue box for file selection. Please choose a file.')

file_path = filedialog.askopenfilename()

print('File selected:', file_path)

Opening dialogue box for file selection. Please choose a file.
File selected: C:/Users/cimacint/Documents/Junk/PandasDemo/WebColors.json


In [19]:
# create a dataframe from a json file

# Load pandas
import pandas as pd

# Read CSV file into DataFrame df
# Try this with the orient = removed also
df = pd.read_json(file_path, orient = 'index')

# Show dataframe
print(df)

       name      hex
1     White  #FFFFFF
2    Silver  #C0C0C0
3      Gray  #808080
4     Black  #000000
5       Red  #FF0000
6    Maroon  #800000
7    Yellow  #FFFF00
8     Olive  #808000
9      Lime  #00FF00
10    Green  #008000
11     Aqua  #00FFFF
12     Teal  #008080
13     Blue  #0000FF
14     Navy  #000080
15  Fuchsia  #FF00FF
16   Purple  #800080


# Introduction to Pandas
https://www.tutorialspoint.com/python_pandas/index.htm

## Data Structures

I've never used panels

In [21]:
# Data structures
print(type(df['name']))
print(type(df))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


## Sorting

In [23]:
# by column values
print(df.sort_values(by='name'))
print(df.sort_values(by='hex'))

       name      hex
11     Aqua  #00FFFF
4     Black  #000000
13     Blue  #0000FF
15  Fuchsia  #FF00FF
3      Gray  #808080
10    Green  #008000
9      Lime  #00FF00
6    Maroon  #800000
14     Navy  #000080
8     Olive  #808000
16   Purple  #800080
5       Red  #FF0000
2    Silver  #C0C0C0
12     Teal  #008080
1     White  #FFFFFF
7    Yellow  #FFFF00
       name      hex
4     Black  #000000
14     Navy  #000080
13     Blue  #0000FF
10    Green  #008000
12     Teal  #008080
9      Lime  #00FF00
11     Aqua  #00FFFF
6    Maroon  #800000
16   Purple  #800080
8     Olive  #808000
3      Gray  #808080
2    Silver  #C0C0C0
5       Red  #FF0000
15  Fuchsia  #FF00FF
7    Yellow  #FFFF00
1     White  #FFFFFF


In [24]:
# sort by index
df.sort_index(ascending=False)

Unnamed: 0,name,hex
16,Purple,#800080
15,Fuchsia,#FF00FF
14,Navy,#000080
13,Blue,#0000FF
12,Teal,#008080
11,Aqua,#00FFFF
10,Green,#008000
9,Lime,#00FF00
8,Olive,#808000
7,Yellow,#FFFF00


In [25]:
# sort columns
df.sort_index(axis=1)

Unnamed: 0,hex,name
1,#FFFFFF,White
2,#C0C0C0,Silver
3,#808080,Gray
4,#000000,Black
5,#FF0000,Red
6,#800000,Maroon
7,#FFFF00,Yellow
8,#808000,Olive
9,#00FF00,Lime
10,#008000,Green


## Data Manipulation

In [27]:
# Add column to dataframe

df['length'] = df['name'].str.len()
df

Unnamed: 0,name,hex,length
1,White,#FFFFFF,5
2,Silver,#C0C0C0,6
3,Gray,#808080,4
4,Black,#000000,5
5,Red,#FF0000,3
6,Maroon,#800000,6
7,Yellow,#FFFF00,6
8,Olive,#808000,5
9,Lime,#00FF00,4
10,Green,#008000,5


In [30]:
# select rows by condition

print(df['length'] > 4)
result_df = df[df['length'] > 4]
result_df

1      True
2      True
3     False
4      True
5     False
6      True
7      True
8      True
9     False
10     True
11    False
12    False
13    False
14    False
15     True
16     True
Name: length, dtype: bool


Unnamed: 0,name,hex,length
1,White,#FFFFFF,5
2,Silver,#C0C0C0,6
4,Black,#000000,5
6,Maroon,#800000,6
7,Yellow,#FFFF00,6
8,Olive,#808000,5
10,Green,#008000,5
15,Fuchsia,#FF00FF,7
16,Purple,#800080,6


In [32]:
# join dataframes

import numpy as np
import pandas as pd

# create a new dataframe with random numbers

data = np.random.randint(5,30,size=16)
df2 = pd.DataFrame(data, columns=['random_numbers'])

print(df2)

# performs an outer join (uses the index as the key)

df = df.join(df2)
df

    random_numbers
0               18
1               16
2               11
3               10
4                8
5                6
6               19
7               20
8               28
9               23
10               7
11              12
12              22
13              12
14              20
15              27


Unnamed: 0,name,hex,length,random_numbers
1,White,#FFFFFF,5,16.0
2,Silver,#C0C0C0,6,11.0
3,Gray,#808080,4,10.0
4,Black,#000000,5,8.0
5,Red,#FF0000,3,6.0
6,Maroon,#800000,6,19.0
7,Yellow,#FFFF00,6,20.0
8,Olive,#808000,5,28.0
9,Lime,#00FF00,4,23.0
10,Green,#008000,5,7.0
