In [None]:
# importing all the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 


### Classes 

In [None]:
class Dog:
    """A dog with a name and a list of tricks.

    This class deliberately demonstrates a pitfall: ``tricks`` is defined on
    the class rather than on the instance, so one list object is shared by
    every ``Dog``.
    """

    # Class attribute: a single list reachable from every instance, so an
    # append made through one dog is visible through all the others.
    tricks = []             # mistaken use of a class variable

    def __init__(self, name):
        """Store the dog's name on the instance."""
        self.name = name
        # The fix: uncomment the next line so that each dog owns its own list.
        #self.tricks =[]  # creates an empty list for each dog

    def add_trick(self, trick):
        """Append ``trick`` to ``self.tricks`` (here, the shared class-level list)."""
        self.tricks.append(trick)

# Create two dogs and teach each of them a different trick.
d = Dog('Fido')
e = Dog('Buddy')
d.add_trick('roll over')
e.add_trick('play dead')
# Displays ['roll over', 'play dead']: both appends went into the single
# class-level list, so Fido appears to know Buddy's trick as well.
d.tricks                # unexpectedly shared by all dogs


### Pandas 

In [None]:
# Build a small DataFrame from a dict that maps each column name to its values.
data = {
    'color': ['red', 'yellow', 'blue', 'orange'],
    'number': [100, 500, 400, 300],
    'diameter': [1.0, 2.5, 1.5, 3.0],
}
frame = pd.DataFrame(data=data)

In [None]:
# visualizing the frame
frame


In [None]:
# Ask for a column ('shape') that `data` does not contain: pandas still creates
# the column and fills it with NaN, i.e. missing values. The rows also get
# explicit string labels instead of the default 0..3.
frame2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four'],
    columns=['color', 'number', 'diameter', 'shape'],
)
frame2

In [None]:
# retrieving a row by its index label; the result is a Series keyed by column name
frame2.loc['two']

In [None]:
# substituting for missing values: a single scalar is broadcast to every row
frame2['shape'] = 'circle'
frame2

In [None]:
# substituting for missing values: a list supplies one value per row, in row
# order, so its length has to match the number of rows
frame2['shape'] = ['circle','oval','triangle','square']
frame2

In [None]:
# A Series assigned as a column is aligned on its index: rows whose label is
# present in the Series receive that value, every other row (here 'one')
# is left as NaN.
val = pd.Series({'two': 'circle', 'three': 'oval', 'four': 'triangle'})
frame2['shape'] = val
frame2

### Loading data with pandas

Now we can use the ``read_csv`` command to read the comma-separated-value data.  This command is pretty sophisticated.  It can read data via a URL (Uniform Resource Locator, see Lecture 2).  Not only that, it can load data straight from a `.zip` file, decompressing it on the fly, as long as the archive contains a single data file.  If an archive holds several `.csv` files, open the one you want yourself (for example with Python's `zipfile` module) and pass the resulting file object to ``read_csv``.  See the [docs](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) for more information.

In [None]:
# reading a csv file using pandas (read_csv is given the URL of a .zip archive directly)
# NOTE: this rebinds `data`, which earlier in the notebook named the dict used
# to build `frame` and `frame2`. A cell that expects the dict (e.g. the one
# building `frame2`) must not be re-run after this point without first
# re-running the cell that defines the dict.
data = pd.read_csv('http://faculty.washington.edu/dacb/HCEPDB_moldata.zip')

In [None]:
data.head()

The ``shape`` attribute shows us the dimensions of the data frame as a ``(rows, columns)`` tuple:

```
data.shape
```

In [None]:
data.shape

The ``columns`` attribute gives us the column names:

```
data.columns
```

In [None]:
data.columns

#### Index Manipulations

The ``index`` attribute gives us the row labels (the index):

```
data.index
```
Let's make our ``id`` column the ``index``

```
data.set_index('id', inplace=True)
```

*Note:* the use of `inplace=True`.  This causes the original data frame to be modified *in place* instead of creating a new data frame and returning the result to be stored in a new variable.

In [None]:
data.index

In [None]:
# Promote the 'id' column to the row index, modifying `data` in place.
# The guard keeps the cell safe to re-run: after the first run 'id' is no
# longer a column, and an unguarded set_index('id') would raise KeyError.
if 'id' in data.columns:
    data.set_index('id', inplace=True)

In [None]:
data.index

The ``dtypes`` attribute gives the data types of each column:

In [None]:
data.dtypes

### Manipulating data with Pandas 

In [None]:
# Express every molecule's mass as a multiple of the mass of water.
# NOTE(review): assumes the 'mass' column is a molar mass in g/mol - confirm
# against the dataset description.
WATER_MOLAR_MASS = 18.01528  # molar mass of H2O in g/mol
data['mass_ratio_H2O'] = data['mass'] / WATER_MOLAR_MASS

In [None]:
data.head()

#### Data Grouping

* In preparation for grouping the data, let's bin the molecules by their molecular mass. For that, we'll use ``pd.cut``.  Documentation of [cut](https://pandas.pydata.org/docs/reference/api/pandas.cut.html).  Cut is used when you want to bin numeric values into discrete intervals.  This is useful for discretizing continuous data and for making histograms.

```
data['mass_group'] = pd.cut(data['mass'], 10)
```

In [None]:
# Split the observed range of 'mass' into 10 equal-width intervals and label
# every molecule with the interval its mass falls into.
data['mass_group'] = pd.cut(x=data['mass'], bins=10)
data.head()

* ``value_counts`` counts how many times each unique value occurs in a column. We can use it, for example, to break down the molecules by the mass group that we just created:

In [None]:
# Number of molecules in each mass bin, most populated bin first.
# Uses the Series method: the top-level pd.value_counts() function is
# deprecated since pandas 2.1.
data['mass_group'].value_counts()

In [None]:
# How often each exact mass value occurs, most frequent first.
# Uses the Series method: the top-level pd.value_counts() function is
# deprecated since pandas 2.1.
data['mass'].value_counts()

In [None]:
# How often each exact 'pce' value occurs, most frequent first.
# Uses the Series method: the top-level pd.value_counts() function is
# deprecated since pandas 2.1.
data['pce'].value_counts()

In [None]:
# Non-null entries per column within each mass bin.
# 'mass_group' is categorical; observed=False is spelled out so that bins
# without any molecule still show up (with a count of 0) on every pandas
# version, and so that newer versions do not warn about the changing default.
data.groupby('mass_group', observed=False).count()


In [None]:
# Mean 'pce' within each mass bin.
# observed=False is spelled out so that empty bins are kept (as NaN) on every
# pandas version and newer versions do not warn about the changing default.
data.groupby('mass_group', observed=False)['pce'].mean()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of 'pce' per mass bin.
# observed=False is spelled out so that empty bins are kept on every pandas
# version and newer versions do not warn about the changing default.
data.groupby('mass_group', observed=False)['pce'].describe()

In [None]:
# reading an excel file using pandas
#df_2 = pd.read_excel('./data/Iris_excel.xlsx', index_col=0)

### Data Visualization

#### Scatter Plots 

In [None]:
# Draw 500 random rows for the plots below; the fixed random_state makes the
# same rows come back on every run.
df = data.sample(n=500, random_state=200)

In [None]:
df.shape

In [None]:
# Scatter plot of VOC against Jsc, drawn through an explicit Figure/Axes pair
# (the same interface the subplot examples further down use).
fig, ax = plt.subplots()
ax.scatter(df['jsc'], df['voc'])
# aesthetics: axis labels and title
ax.set_xlabel('Jsc')
ax.set_ylabel('VOC')
ax.set_title('VOC vs Jsc')
# show the plot
plt.show()



In [None]:
# changing the color of the markers
fig, ax = plt.subplots()
ax.scatter(df['jsc'], df['voc'], color='red')
ax.set_xlabel('Jsc')
ax.set_ylabel('VOC')
ax.set_title('VOC vs Jsc')
# Finish with plt.show() so the cell renders only the figure instead of also
# echoing the Text object returned by the last call.
plt.show()

In [None]:
# changing marker type and size: 's' draws squares, s=10 sets the marker area
fig, ax = plt.subplots()
ax.scatter(df['jsc'], df['voc'], color='green', marker='s', s=10)
ax.set_xlabel('Jsc')
ax.set_ylabel('VOC')
ax.set_title('VOC vs Jsc')
# Finish with plt.show() so the cell renders only the figure instead of also
# echoing the Text object returned by the last call.
plt.show()

#### Subplots

In [None]:
# Plotting two groups of data points, split by mass, on the same axes.
# The threshold is defined once so the filters and the legend labels cannot
# drift apart.
MASS_THRESHOLD = 500
small_mass = df[df['mass'] < MASS_THRESHOLD]    # rows with mass below the threshold
large_mass = df[df['mass'] >= MASS_THRESHOLD]   # rows with mass at or above the threshold

fig, ax = plt.subplots()

# plot the different data groups as different colors
ax.scatter(small_mass['jsc'], small_mass['voc'], color='green', label=f'mass < {MASS_THRESHOLD}')
ax.scatter(large_mass['jsc'], large_mass['voc'], color='blue', label=f'mass >= {MASS_THRESHOLD}')

ax.set_xlabel('Jsc')
ax.set_ylabel('VOC')
ax.set_title('VOC vs Jsc')
ax.legend()  # add a legend built from the label= values above
plt.show()

In [None]:
# subplots with 2 plots in two rows; sharex=True makes both panels use the
# same x-axis range
fig, axes = plt.subplots(2, 1, figsize=(5, 10), sharex=True)

print(axes) # axes is now an array of axes objects instead of just one

# first plot (top panel): VOC against Jsc
axes[0].scatter(df['jsc'], df['voc'])
axes[0].set_xlabel('Jsc')
axes[0].set_ylabel('VOC')

# second plot (bottom panel): mass against Jsc
axes[1].scatter(df['jsc'], df['mass'])
axes[1].set_xlabel('Jsc')
axes[1].set_ylabel('Mass')

# Finish with plt.show() so the cell renders only the figure instead of also
# echoing the Text object returned by the last call.
plt.show()