# Crash course on Python

## Lists
Lists are sequences of items (which can be of different types). They are defined as comma-separated items inside square brackets.

In [None]:
myList = ['abc', 20, 3.14, 400, 7.5, 88]

We can access individual members of a list using square-bracket “array” notation (indices start at 0).

In [None]:
myList[1]

Or you can even get an item from the list counting from right to left (Notice the negative index!)

In [None]:
myList[-5]

## Slicing
You can get a subset of the list very easily in Python.

In [None]:
myList[1:4] #(excluding element 4!!)

In [None]:
# Take from the element 1 up to the end
myList[1:]

In [None]:
# Take from start up to element 4 (excluding 4!!)
myList[:4]

In [None]:
# To make a copy of an entire sequence, you can use [:]
myList[:]

In [None]:
# Plain assignment does NOT copy: list1 and myList are two names for the same list object
list1 = myList     # changing one affects both list1 and myList
list1.append(111)
print(myList)
print(list1)

In [None]:
list2 = myList[:]  # two independent copies 
print(myList)
list2.append(222)
print(myList)
print(list2)

## The `in` operator

In [None]:
20 in myList

In [None]:
21 in myList

In [None]:
myName = 'Python'
'Py' in myName

In [None]:
'py' in myName

In [None]:
for items in myList:
    print (items)

In [None]:
for i,items in enumerate(myList):
    print (i,items)

## Dictionaries: A Mapping type
Dictionaries store a mapping between a set of keys and a set of values.

- Keys must be hashable (e.g. strings, numbers, tuples); values can be any type.
- A single dictionary can store values of different types.
- You can define, modify, view, look up, and delete the key-value pairs in the dictionary.

In [None]:
d = {'user':'dina', 'pwd':1234}
d['user']

In [None]:
d['pwd']

In [None]:
d['user'] = 'dinos'
d

In [None]:
d['age'] = 42
d

In [None]:
d.keys()  # View of the dictionary's keys (in Python 3 this is not a list; wrap it in list(...) if you need one)

In [None]:
d.values()

In [None]:
d.items()

In [None]:
del d['user']
d

In [None]:
d.clear()
d

# Numpy

## Create a numpy array

In [None]:
import numpy as np

In [None]:
#Create a 1D ndarray
a = np.array([1, 2, 3, 4, 5])

In [None]:
# dimension(rank) of the array
a.ndim

In [None]:
# number of elements in the array
a.size

In [None]:
# the size of the array in each dimension
a.shape

In [None]:
# check the type of the array
type(a)

In [None]:
# check the type of the elements in the array
a.dtype

In [None]:
# You can also define the datatype of the array at creation.
# Look what happens if you select an int type but pass float values: the fractional part is dropped.
another_array = np.array([3.14, 5, 6.4], dtype=np.int64)
print(another_array)
print(another_array.dtype)

## Basic mathematical operations

In [None]:
# Two small 1D vectors used by the arithmetic examples
v, u = np.array([1, 2]), np.array([3, 1])

#### Addition

In [None]:
z = v+u
print('z :',z)

In [None]:
# or with np.add()
z= np.add(v,u)
print('z :',z)

In [None]:
## Adding a constant
z = z+1
print('z +1 :',z)

In [None]:
z = np.add(1,z)
print('z +1 :',z)

#### Subtraction

In [None]:
z = v-u
print(z)
## or
z = np.subtract(v,u)
print(z)

#### Multiplication

In [None]:
w = 2*z # multiplication by scalar
print('w :',w)

In [None]:
q = z * w # multiplication of 2 arrays
print('z*w :',q)

In [None]:
q = np.dot(z,w)
print('q :',q)

#### Division

In [None]:
z = v / u
print(z)
# Or with np.divide
z = np.divide(v,u)
print(z)

## 2D arrays

In [None]:
a=np.array([ [1,2,3], [4,5,6], [7,8,9] ])
print(a)

In [None]:
b=np.array([ [10,11,12], [13,14,15], [16,17,18] ])
print(b)

In [None]:
a*b

In [None]:
a+b

In [None]:
a.ndim # number of nested lists - array dimension

In [None]:
a.shape # returns a tuple showing (number of nested lists(rows), number of elements in each list(columns) )

In [None]:
a.size # number of elements ( rows x columns)

In [None]:
a*b

In [None]:
np.dot(a,b)

In [None]:
# Let's create another 2D array (5 rows x 3 columns)
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10,11,12], [13,14,15]])
print (a)

**You can think of a ROOT TTree containing variables as a NumPy array where every row is a different event and every column is a different variable.**

In [None]:
# What are the dimensions of the array (i.e. how many events and variables are there)?
a.shape

In [None]:
# min/max element along each row (axis=1), each column (axis=0)
a.min(axis=1)

In [None]:
# Take the min from each row and add together
minArray = a.min(axis=1)
minArray.sum()

In [None]:
minArray.mean()

In [None]:
# get a specific element in [row,column], here row=1, column=2
a[1,2]

## Slicing and selections

### Slicing

In [None]:
# create a new array with only the first 3 rows(=events!)
b=a[:3] # Basic slicing returns a VIEW that shares memory with `a`, not a copy; use a[:3].copy() for an independent array
print (b)
type(b)

In [None]:
# create a new array with only the first 3 rows(=events!) and only the 1st and 3rd columns(=variables!)
# Indexing with a list ([0,2]) is "fancy" indexing, which returns a copy rather than a view
c=a[:3,[0,2]]
print (c)

In [None]:
# Notice the difference
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10,11,12], [13,14,15]])
print (a)

In [None]:
row_rank1 = a[1,:]
row_rank2 = a[1:2,:]
print(row_rank1, row_rank1.shape) # rank1
print(row_rank2, row_rank2.shape) # rank2

### Selections and filtering

You can create a filter on the values of the numpy array based on some criteria

In [None]:
a = np.array([[11,12], [21,22], [31,32]])
print(a)

In [None]:
# Let's create a filter (a boolean mask), e.g. for the values above 13
# NOTE: the name `filter` shadows Python's built-in filter() from here on; a name such as `mask` would avoid that
filter  = (a>13)
print(a)
print(filter)

In [None]:
# You can then select the values that pass the filter like this
a[filter]

In [None]:
# Or with more complex selections
a[ (a>13) & (a<30)]

## Other useful functions

In [None]:
np.linspace(-2,2,num=5)

In [None]:
x = np.linspace(0 , 2*np.pi,100)
y = np.sin(x)
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(x,y)
plt.savefig('sin.png')

In [None]:
# The arange function accepts the start and end of a range, followed by the step.
# The end value (15 here) is excluded from the result.
a = np.arange(1, 15, 2)
a

In [None]:
# Generate some values
arr = np.random.randn(2,5)
arr

In [None]:
print(arr.mean()) # mean of all elements
print(arr.mean(axis=1)) # mean per row
print(arr.mean(axis=0)) # mean per column
print(arr.sum()) # sum
print(np.median(arr,axis=1))

#### Sorting

In [None]:
arr = np.random.randn(10)
arr

In [None]:
arr.sort()
arr

# Pandas

In [None]:
import pandas as pd

In [None]:
# Create a pandas Series. With no explicit index, the index defaults to 0,1,...
my_series = pd.Series(data=[100, 200, 300, 400])
my_series

In [None]:
# With custom indices
my_series = pd.Series(data=[100, 200, 300, 400], index=['jet1', 'jet2', 'jet3', 'jet4'])
my_series

In [None]:
# With different data types
my_series = pd.Series(data=[100, 'abc', 300, 'jkl'], index=['jet1', 'jet2', 'jet3', 'jet4'])
my_series

## Selection  and filtering

In [None]:
# Remember the indices of the series
my_series.index

In [None]:
# Access value of element at a particular index
my_series['jet2']

In [None]:
# Access value of element at a particular index with loc (recommended!!)
my_series.loc['jet2']

In [None]:
my_series.loc[['jet1','jet2']]

## DataFrame

In [None]:
# Example of creating one DataFrame from series
d= { 'pt' : pd.Series(np.random.uniform(low=0., high=1000., size=(3,)),index=['jet1','jet2','jet3'] ),
     'eta' : pd.Series(np.random.uniform(low=-2.5, high=2.5, size=(3,)),index=['jet1','jet2','jet3'] ),
     'phi' : pd.Series(np.random.uniform(low=-3.2, high=3.2, size=(4,)),index=['jet1','jet2','jet3','jet4'] )
    
   }
df_new = pd.DataFrame(d)
df_new                      

In [None]:
df_new.columns

In [None]:
# Example of creating one DataFrame from a dictionary
lepType = ['Muon','Electron','Tau']
df = pd.DataFrame({ 'lep1_Pt'  : np.random.uniform(low=0., high=1000., size=(15,)),
                    'lep1_Eta' : np.random.uniform(low=-2.5, high=2.5, size=(15,)),
                    'lep1_Phi' : np.random.uniform(low=-3.2, high=3.2, size=(15,)),
                    'ParticleType' : np.random.choice(lepType,size=15)
                  })
# print it
df

In [None]:
# Returns the column with label 'lep1_Pt' as a Series
df['lep1_Pt'] 
# or by position: df.iloc[:,0]

In [None]:
#Create a new column
df['lep1_charge'] = 1
df

In [None]:
#Delete a column from the dataframe; pop() removes it from df and returns it as a Series
df_save = df.pop('lep1_charge')
df
# or by del (which does not give the removed column back)
# del df['lep1_charge']

In [None]:
df_save

In [None]:
#Slice, filter etc
df[df.lep1_Eta > 0] #filtering by variable value

In [None]:
# Get the first 3 rows (events)
df[:3]

In [None]:
# Select a subset of the variables
VariablesToUse = ['lep1_Pt','lep1_Eta']
df2 = df[VariablesToUse]
df2

In [None]:
# Group by a label
df.groupby('ParticleType').lep1_Pt.mean()

## Data statistics

In [None]:
df.describe() 

In [None]:
# Column-wise mean. numeric_only=True skips the string column 'ParticleType';
# without it, pandas >= 2.0 raises a TypeError on non-numeric data.
df.mean(numeric_only=True)

In [None]:
df.min()

In [None]:
df.max()

In [None]:
# df.mode()

In [None]:
# Pairwise correlation between the numeric columns. The string column
# 'ParticleType' is dropped first, because pandas >= 2.0 no longer ignores
# non-numeric columns silently and would raise on it.
df.select_dtypes(include='number').corr()

In [None]:
# Column-wise median. numeric_only=True skips the string column 'ParticleType';
# without it, pandas >= 2.0 raises a TypeError on non-numeric data.
df.median(numeric_only=True)

In [None]:
# Column-wise standard deviation. numeric_only=True skips the string column
# 'ParticleType'; without it, pandas >= 2.0 raises a TypeError on non-numeric data.
df.std(numeric_only=True)

In [None]:
filter1 = df['lep1_Pt'] >800
filter1.any()

In [None]:
print(filter1,df['lep1_Pt'])

In [None]:
filter2 = df['lep1_Pt'] >0
filter2.all()

In [None]:
df.iloc[1,1] = None

In [None]:
df

In [None]:
df.isnull().any()

In [None]:
df1 = df.dropna()

In [None]:
df1

In [None]:
# Fill the missing value by linear interpolation between neighbouring rows.
# Only the numeric columns are interpolated: calling interpolate() on a frame
# that still contains the object-dtype 'ParticleType' column is deprecated in
# recent pandas releases. The non-numeric columns are carried over unchanged.
numeric_cols = df.select_dtypes(include='number').columns
df2 = df.copy()
df2[numeric_cols] = df[numeric_cols].interpolate()
df2

# Visualization

In [None]:
df[['lep1_Eta','lep1_Phi']].plot.bar()

In [None]:
df[['lep1_Eta','lep1_Phi']].plot.hist()

In [None]:
df[['lep1_Eta','lep1_Phi']].plot()

In [None]:
df.hist(column='lep1_Pt', figsize=(15,10))

## Adding DataFrames

Let's first create two DataFrames.

In [None]:
# Build two 5-row frames that share the same four columns,
# each column filled with standard-normal random numbers
nvalues = 5
df1 = pd.DataFrame({name: np.random.randn(nvalues)
                    for name in ('var1', 'var2', 'var3', 'var4')})
df2 = pd.DataFrame({name: np.random.randn(nvalues)
                    for name in ('var1', 'var2', 'var3', 'var4')})
# show the first frame
df1

In [None]:
df2

In [None]:
df_result1 = pd.concat([df1,df2])
df_result1

In [None]:
df_result2 = pd.concat([df1,df2],ignore_index=True)
df_result2

In [None]:
# Let's add another column to df2
df2['NewVar1'] = np.random.randn(nvalues)
df2

In [None]:
df_result3 = pd.concat([df1,df2])
df_result3

# Exercises

#### 1. Stack ser1 and ser2 vertically and horizontally (to form a dataframe).

In [None]:
#input
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

#### 2. Compute the mean squared error of truth and pred series.

In [None]:
# Input
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

#### 3. Filter every nth row in a dataframe

From df, filter the 'Manufacturer', 'Model' and 'Type' for every 11th row starting from 1st (row 0).


In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

#### 4. Grouping by a label
Import the alcohol consumption dataset. 
- Which continent drinks the most beer on average?
- For each continent print the statistics for wine consumption

In [None]:
# Input
drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv')
drinks.head()