In [55]:
import pandas as pd
import numpy as np
from datetime import datetime
import math

In [2]:
# Pandas Series

series = pd.Series([1,2,3,4,5])
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [3]:
# Create using specified index
series = pd.Series([1,2,3,4,5], index=['a','b','c','d','c'])
print(series)

a    1
b    2
c    3
d    4
c    5
dtype: int64


In [4]:
# Accessing elements in a series
# NOTE: the index here holds string labels, so series[2] is not a label lookup; it falls back
# to positional access. Newer pandas releases deprecate that fallback (TODO confirm against the
# installed version), which makes .iloc[2] the explicit, unambiguous way to ask for a position.
print(series[2])
print(series.iloc[2])

3
3


In [5]:
print(series['d'])
print(series.loc['d'])

4
4


In [6]:
print(series['c'])

c    3
c    5
dtype: int64


In [7]:
print(series[2:])
print(series.iloc[2:])

c    3
d    4
c    5
dtype: int64
c    3
d    4
c    5
dtype: int64


In [8]:
# Specifying a datetime range as the index of a series
dates1 = pd.date_range('20190525', periods=12)
print(dates1)

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03', '2019-06-04', '2019-06-05'],
              dtype='datetime64[ns]', freq='D')


In [9]:
series = pd.Series([1,2,3,4,5,6,7,8,9,10,11,12])
series.index = dates1
print(series)

2019-05-25     1
2019-05-26     2
2019-05-27     3
2019-05-28     4
2019-05-29     5
2019-05-30     6
2019-05-31     7
2019-06-01     8
2019-06-02     9
2019-06-03    10
2019-06-04    11
2019-06-05    12
Freq: D, dtype: int64


In [10]:
# change the frequency parameter to month. will create datetime index of month intervals

dates2 = pd.date_range('2019-05-01', periods=12, freq='M')
print(dates2)

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'],
              dtype='datetime64[ns]', freq='M')


In [11]:
# start date parameter doesn't require hyphens
dates_test = pd.date_range('20190501', periods=12, freq='M')
dates_test

DatetimeIndex(['2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31',
               '2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'],
              dtype='datetime64[ns]', freq='M')

In [12]:
# MS as in month start freq
dates2 = pd.date_range('20190501', periods=12, freq='MS')
print(dates2)

DatetimeIndex(['2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
              dtype='datetime64[ns]', freq='MS')


In [13]:
# can also set time: (in hours here)

dates3 = pd.date_range('20190517 09:00:00', periods=8, freq='H')
dates3

DatetimeIndex(['2019-05-17 09:00:00', '2019-05-17 10:00:00',
               '2019-05-17 11:00:00', '2019-05-17 12:00:00',
               '2019-05-17 13:00:00', '2019-05-17 14:00:00',
               '2019-05-17 15:00:00', '2019-05-17 16:00:00'],
              dtype='datetime64[ns]', freq='H')

In [14]:
# Pandas Dataframe with dimension 10x4, 10 rows and 4 columns
df = pd.DataFrame(np.random.randn(10,4),
                 columns=list('ABCD'))
print(df)

          A         B         C         D
0  0.167796  0.532546  0.039801 -1.703596
1 -0.752633 -0.674010 -1.613684  0.229568
2 -1.401116 -1.817229  0.019446  0.340082
3 -0.512681  0.447899  0.323204  1.100412
4  2.664663  0.648563 -0.293664  0.940590
5 -2.222771  1.089603 -0.115089 -0.969619
6 -1.357984  0.476486  0.902178  1.065553
7 -0.795634 -0.812855  1.298812 -0.027194
8  0.569864  0.604282  2.365380  0.002124
9  0.116326  0.850134  0.420978  1.307046


In [15]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,A,B,C,D
0,0.187497,1.12215,-0.988277,-1.985934
1,0.360803,-0.562243,-0.340693,-0.986988
2,-0.040627,0.067333,-0.452978,0.686223
3,-0.279572,-0.702492,0.252265,0.958977
4,0.537438,-1.737568,0.714727,-0.939288


In [16]:
# Specifying the index in a df

days = pd.date_range('20190525', periods=10)
df.index = days
df

Unnamed: 0,A,B,C,D
2019-05-25,0.187497,1.12215,-0.988277,-1.985934
2019-05-26,0.360803,-0.562243,-0.340693,-0.986988
2019-05-27,-0.040627,0.067333,-0.452978,0.686223
2019-05-28,-0.279572,-0.702492,0.252265,0.958977
2019-05-29,0.537438,-1.737568,0.714727,-0.939288
2019-05-30,0.070011,-0.516443,-1.655689,0.246721
2019-05-31,0.001268,0.951517,2.10736,-0.108726
2019-06-01,-0.185258,0.85652,-0.686285,1.104195
2019-06-02,0.387023,1.706336,-2.452653,0.260466
2019-06-03,-1.054974,0.556775,-0.945219,-0.030295


In [17]:
print(df.index)

DatetimeIndex(['2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
               '2019-05-29', '2019-05-30', '2019-05-31', '2019-06-01',
               '2019-06-02', '2019-06-03'],
              dtype='datetime64[ns]', freq='D')


In [18]:
print(df.values)

[[ 1.874970e-01  1.122150e+00 -9.882770e-01 -1.985934e+00]
 [ 3.608030e-01 -5.622430e-01 -3.406930e-01 -9.869880e-01]
 [-4.062700e-02  6.733300e-02 -4.529780e-01  6.862230e-01]
 [-2.795720e-01 -7.024920e-01  2.522650e-01  9.589770e-01]
 [ 5.374380e-01 -1.737568e+00  7.147270e-01 -9.392880e-01]
 [ 7.001100e-02 -5.164430e-01 -1.655689e+00  2.467210e-01]
 [ 1.268000e-03  9.515170e-01  2.107360e+00 -1.087260e-01]
 [-1.852580e-01  8.565200e-01 -6.862850e-01  1.104195e+00]
 [ 3.870230e-01  1.706336e+00 -2.452653e+00  2.604660e-01]
 [-1.054974e+00  5.567750e-01 -9.452190e-01 -3.029500e-02]]


In [19]:
# Get descriptive statistics
print(df.describe())
print(df.mean(0)) # columns
print(df.mean(1)) # rows


               A          B          C          D
count  10.000000  10.000000  10.000000  10.000000
mean   -0.001639   0.174188  -0.444744  -0.079465
std     0.451656   1.049677   1.267397   0.971164
min    -1.054974  -1.737568  -2.452653  -1.985934
25%    -0.149100  -0.550793  -0.977513  -0.731648
50%     0.035640   0.312054  -0.569632   0.108213
75%     0.317476   0.927768   0.104025   0.579784
max     0.537438   1.706336   2.107360   1.104195
A   -0.001639
B    0.174188
C   -0.444744
D   -0.079465
dtype: float64
2019-05-25   -0.416141
2019-05-26   -0.382280
2019-05-27    0.064988
2019-05-28    0.057294
2019-05-29   -0.356173
2019-05-30   -0.463850
2019-05-31    0.737855
2019-06-01    0.272293
2019-06-02   -0.024707
2019-06-03   -0.368428
Freq: D, dtype: float64


### Extracting from dataframes:
`.head()`, `.tail()`, `.iloc[]`, `.loc[]`

In [20]:
print(df.head())

                   A         B         C         D
2019-05-25  0.187497  1.122150 -0.988277 -1.985934
2019-05-26  0.360803 -0.562243 -0.340693 -0.986988
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
2019-05-29  0.537438 -1.737568  0.714727 -0.939288


In [21]:
print(df.tail())

                   A         B         C         D
2019-05-30  0.070011 -0.516443 -1.655689  0.246721
2019-05-31  0.001268  0.951517  2.107360 -0.108726
2019-06-01 -0.185258  0.856520 -0.686285  1.104195
2019-06-02  0.387023  1.706336 -2.452653  0.260466
2019-06-03 -1.054974  0.556775 -0.945219 -0.030295


In [22]:
# selecting a specific column. returns a series
print(df['A'])
print(df.A)

2019-05-25    0.187497
2019-05-26    0.360803
2019-05-27   -0.040627
2019-05-28   -0.279572
2019-05-29    0.537438
2019-05-30    0.070011
2019-05-31    0.001268
2019-06-01   -0.185258
2019-06-02    0.387023
2019-06-03   -1.054974
Freq: D, Name: A, dtype: float64
2019-05-25    0.187497
2019-05-26    0.360803
2019-05-27   -0.040627
2019-05-28   -0.279572
2019-05-29    0.537438
2019-05-30    0.070011
2019-05-31    0.001268
2019-06-01   -0.185258
2019-06-02    0.387023
2019-06-03   -1.054974
Freq: D, Name: A, dtype: float64


In [23]:
# Double brackets because we are passing in a LIST of column labels. Instead of a Series, you get a DataFrame.

print(df[['A','B']])

                   A         B
2019-05-25  0.187497  1.122150
2019-05-26  0.360803 -0.562243
2019-05-27 -0.040627  0.067333
2019-05-28 -0.279572 -0.702492
2019-05-29  0.537438 -1.737568
2019-05-30  0.070011 -0.516443
2019-05-31  0.001268  0.951517
2019-06-01 -0.185258  0.856520
2019-06-02  0.387023  1.706336
2019-06-03 -1.054974  0.556775


In [24]:
# Slicing based on row number. this extracts row number 2 thru 4 (not inclusive)
print(df[2:4])
print(df.iloc[2:4])
# return only rows 2 and 4:
print(df.iloc[[2,4]])
# print row 2:
print(df.iloc[2])

                   A         B         C         D
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
                   A         B         C         D
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
                   A         B         C         D
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-29  0.537438 -1.737568  0.714727 -0.939288
A   -0.040627
B    0.067333
C   -0.452978
D    0.686223
Name: 2019-05-27 00:00:00, dtype: float64


In [25]:
# Slicing based on row and column numbers

print(df.iloc[2:4, 1:4])

                   B         C         D
2019-05-27  0.067333 -0.452978  0.686223
2019-05-28 -0.702492  0.252265  0.958977


In [26]:
# Specific rows and columns:
print(df.iloc[[2,4], [1,3]])

                   B         D
2019-05-27  0.067333  0.686223
2019-05-29 -1.737568 -0.939288


In [27]:
# Slicing based on labels

print(df['20190601':'20190603'])

                   A         B         C         D
2019-06-01 -0.185258  0.856520 -0.686285  1.104195
2019-06-02  0.387023  1.706336 -2.452653  0.260466
2019-06-03 -1.054974  0.556775 -0.945219 -0.030295


In [28]:
print(df.loc['20190601':'20190603'])

                   A         B         C         D
2019-06-01 -0.185258  0.856520 -0.686285  1.104195
2019-06-02  0.387023  1.706336 -2.452653  0.260466
2019-06-03 -1.054974  0.556775 -0.945219 -0.030295


In [29]:
print(df.loc['20190601':'20190603', 'A':'C'])

                   A         B         C
2019-06-01 -0.185258  0.856520 -0.686285
2019-06-02  0.387023  1.706336 -2.452653
2019-06-03 -1.054974  0.556775 -0.945219


In [30]:
print(df.loc['20190601':'20190603', ['A', 'C']])

                   A         C
2019-06-01 -0.185258 -0.686285
2019-06-02  0.387023 -2.452653
2019-06-03 -1.054974 -0.945219


In [31]:
# convert date into datetime format

date1 = datetime(2019, 6, 1, 0, 0, 0)
date2 = datetime(2019, 6, 3, 0, 0, 0)

print(df.loc[[date1, date2]])

                   A         B         C         D
2019-06-01 -0.185258  0.856520 -0.686285  1.104195
2019-06-03 -1.054974  0.556775 -0.945219 -0.030295


In [32]:
# specific row and column:

print(df.loc[date1, ['A','C']])

A   -0.185258
C   -0.686285
Name: 2019-06-01 00:00:00, dtype: float64


To summarize, if you want to extract a range of rows using their labels, you can simply use the following syntax: `df[start_label:end_label]`. If you want to extract specific rows or columns, use the `loc` indexer with the following syntax: `df.loc[[row_1_label, row_2_label, ..., row_n_label], [column_1_label, column_2_label, ..., column_n_label]]`.

In [37]:
# selecting a single cell in a df with the .at indexer (square brackets, not a function call)

d = datetime(2019, 6, 3, 0, 0, 0)
print(df.at[d,'B'])

0.556775


In [38]:
df

Unnamed: 0,A,B,C,D
2019-05-25,0.187497,1.12215,-0.988277,-1.985934
2019-05-26,0.360803,-0.562243,-0.340693,-0.986988
2019-05-27,-0.040627,0.067333,-0.452978,0.686223
2019-05-28,-0.279572,-0.702492,0.252265,0.958977
2019-05-29,0.537438,-1.737568,0.714727,-0.939288
2019-05-30,0.070011,-0.516443,-1.655689,0.246721
2019-05-31,0.001268,0.951517,2.10736,-0.108726
2019-06-01,-0.185258,0.85652,-0.686285,1.104195
2019-06-02,0.387023,1.706336,-2.452653,0.260466
2019-06-03,-1.054974,0.556775,-0.945219,-0.030295


In [40]:
# Selecting based on cell value. Boolean indexing

print(df[(df['A']>0) & (df['B']>0)])

                   A         B         C         D
2019-05-25  0.187497  1.122150 -0.988277 -1.985934
2019-05-31  0.001268  0.951517  2.107360 -0.108726
2019-06-02  0.387023  1.706336 -2.452653  0.260466


In [46]:
# transforming dfs: reflect df over its main diagonal (converting columns to rows and rows to columns)

print(df.transpose())

print(df.T) # alternative

print(df)

   2019-05-25  2019-05-26  2019-05-27  2019-05-28  2019-05-29  2019-05-30  \
A    0.187497    0.360803   -0.040627   -0.279572    0.537438    0.070011   
B    1.122150   -0.562243    0.067333   -0.702492   -1.737568   -0.516443   
C   -0.988277   -0.340693   -0.452978    0.252265    0.714727   -1.655689   
D   -1.985934   -0.986988    0.686223    0.958977   -0.939288    0.246721   

   2019-05-31  2019-06-01  2019-06-02  2019-06-03  
A    0.001268   -0.185258    0.387023   -1.054974  
B    0.951517    0.856520    1.706336    0.556775  
C    2.107360   -0.686285   -2.452653   -0.945219  
D   -0.108726    1.104195    0.260466   -0.030295  
   2019-05-25  2019-05-26  2019-05-27  2019-05-28  2019-05-29  2019-05-30  \
A    0.187497    0.360803   -0.040627   -0.279572    0.537438    0.070011   
B    1.122150   -0.562243    0.067333   -0.702492   -1.737568   -0.516443   
C   -0.988277   -0.340693   -0.452978    0.252265    0.714727   -1.655689   
D   -1.985934   -0.986988    0.686223    0.958

In [47]:
# checking to see if a result is a dataframe or series:

def checkSeriesOrDataframe(var):
    """Report whether ``var`` is a pandas DataFrame or a pandas Series.

    Parameters
    ----------
    var : object
        Any value, typically the result of an indexing operation.

    Returns
    -------
    str or None
        ``'Dataframe'`` if ``var`` is a ``pd.DataFrame``, ``'Series'`` if it
        is a ``pd.Series``, otherwise ``None``.
    """
    # The built-in is spelled ``isinstance``; the earlier ``isintance`` typo
    # raised NameError on every call.
    if isinstance(var, pd.DataFrame):
        return 'Dataframe'
    if isinstance(var, pd.Series):
        return 'Series'
    # Neither type matched: make the "no answer" result explicit.
    return None

In [51]:
# Sorting data in a dataframe with 2 ways; sort by labels (axis) using sort_index() function and sort by value using
# sort_values() function

print(df.sort_index(axis=0, ascending=False)) # axis=0 means sort by index
print(df.sort_index(axis=1, ascending=False)) # axis=1 means sort by col labels (descending order)

                   A         B         C         D
2019-06-03 -1.054974  0.556775 -0.945219 -0.030295
2019-06-02  0.387023  1.706336 -2.452653  0.260466
2019-06-01 -0.185258  0.856520 -0.686285  1.104195
2019-05-31  0.001268  0.951517  2.107360 -0.108726
2019-05-30  0.070011 -0.516443 -1.655689  0.246721
2019-05-29  0.537438 -1.737568  0.714727 -0.939288
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-26  0.360803 -0.562243 -0.340693 -0.986988
2019-05-25  0.187497  1.122150 -0.988277 -1.985934
                   D         C         B         A
2019-05-25 -1.985934 -0.988277  1.122150  0.187497
2019-05-26 -0.986988 -0.340693 -0.562243  0.360803
2019-05-27  0.686223 -0.452978  0.067333 -0.040627
2019-05-28  0.958977  0.252265 -0.702492 -0.279572
2019-05-29 -0.939288  0.714727 -1.737568  0.537438
2019-05-30  0.246721 -1.655689 -0.516443  0.070011
2019-05-31 -0.108726  2.107360  0.951517  0.001268
2019-06-01  1.104195 -0.686285 

In [54]:
# Sorting by value

print(df.sort_values('A', axis=0)) # reorders the ROWS by the values in column A, least to greatest (ascending is the default)
print(df.sort_values('20190601', axis=1)) # reorders the COLUMNS by the values found in the 2019-06-01 row, least to greatest


                   A         B         C         D
2019-06-03 -1.054974  0.556775 -0.945219 -0.030295
2019-05-28 -0.279572 -0.702492  0.252265  0.958977
2019-06-01 -0.185258  0.856520 -0.686285  1.104195
2019-05-27 -0.040627  0.067333 -0.452978  0.686223
2019-05-31  0.001268  0.951517  2.107360 -0.108726
2019-05-30  0.070011 -0.516443 -1.655689  0.246721
2019-05-25  0.187497  1.122150 -0.988277 -1.985934
2019-05-26  0.360803 -0.562243 -0.340693 -0.986988
2019-06-02  0.387023  1.706336 -2.452653  0.260466
2019-05-29  0.537438 -1.737568  0.714727 -0.939288
                   C         A         B         D
2019-05-25 -0.988277  0.187497  1.122150 -1.985934
2019-05-26 -0.340693  0.360803 -0.562243 -0.986988
2019-05-27 -0.452978 -0.040627  0.067333  0.686223
2019-05-28  0.252265 -0.279572 -0.702492  0.958977
2019-05-29  0.714727  0.537438 -1.737568 -0.939288
2019-05-30 -1.655689  0.070011 -0.516443  0.246721
2019-05-31  2.107360  0.001268  0.951517 -0.108726
2019-06-01 -0.686285 -0.185258 

In [61]:
# applying functions to a df with .apply function

def sq_root(x):
    """Return the square root of ``x`` if it is positive, otherwise ``x`` unchanged.

    Zero and negative values are passed through untouched, which avoids the
    ``ValueError`` that ``math.sqrt`` raises for negative input. Intended for
    scalar values (e.g. via ``Series.apply``).
    """
    return math.sqrt(x) if x > 0 else x


def sq(x):
    """Return ``x`` squared (``x**2``)."""
    return x**2

In [62]:
print(df['B'].apply(sq_root))

2019-05-25    1.059316
2019-05-26   -0.562243
2019-05-27    0.259486
2019-05-28   -0.702492
2019-05-29   -1.737568
2019-05-30   -0.516443
2019-05-31    0.975457
2019-06-01    0.925484
2019-06-02    1.306268
2019-06-03    0.746174
Freq: D, Name: B, dtype: float64


In [63]:
print(df.B.apply(sq))

2019-05-25    1.259221
2019-05-26    0.316117
2019-05-27    0.004534
2019-05-28    0.493495
2019-05-29    3.019143
2019-05-30    0.266713
2019-05-31    0.905385
2019-06-01    0.733627
2019-06-02    2.911583
2019-06-03    0.309998
Freq: D, Name: B, dtype: float64


In [66]:
# df.apply(sq_root) results in an error: DataFrame.apply hands each whole column (a Series) to the
# function, and sq_root's `if x>0` test cannot reduce a Series to a single True/False.
df.apply(sq) # sq works on a whole column because x**2 is computed element-wise

Unnamed: 0,A,B,C,D
2019-05-25,0.035155,1.259221,0.976691,3.943934
2019-05-26,0.130179,0.316117,0.116072,0.974145
2019-05-27,0.001651,0.004534,0.205189,0.470902
2019-05-28,0.078161,0.493495,0.063638,0.919637
2019-05-29,0.28884,3.019143,0.510835,0.882262
2019-05-30,0.004902,0.266713,2.741306,0.060871
2019-05-31,2e-06,0.905385,4.440966,0.011821
2019-06-01,0.034321,0.733627,0.470987,1.219247
2019-06-02,0.149787,2.911583,6.015507,0.067843
2019-06-03,1.11297,0.309998,0.893439,0.000918


In [67]:
# Apply sq_root one column at a time: Series.apply calls the function on each scalar value,
# so the `if x>0` test inside sq_root is evaluated per cell.
# NOTE: this overwrites df's columns. Re-running the cell takes square roots of values that
# were already transformed, and the cells below operate on the modified data.
for column in df:  # iterating over a DataFrame yields its column labels
    df[column] = df[column].apply(sq_root)
print(df)

                   A         B         C         D
2019-05-25  0.433009  1.059316 -0.988277 -1.985934
2019-05-26  0.600669 -0.562243 -0.340693 -0.986988
2019-05-27 -0.040627  0.259486 -0.452978  0.828386
2019-05-28 -0.279572 -0.702492  0.502260  0.979274
2019-05-29  0.733102 -1.737568  0.845415 -0.939288
2019-05-30  0.264596 -0.516443 -1.655689  0.496710
2019-05-31  0.035609  0.975457  1.451675 -0.108726
2019-06-01 -0.185258  0.925484 -0.686285  1.050807
2019-06-02  0.622112  1.306268 -2.452653  0.510359
2019-06-03 -1.054974  0.746174 -0.945219 -0.030295


In [68]:
# the apply function can be applied along either axis: index (0; apply the function to each column) or columns (1; apply the
# function to each row).

print(df.apply(np.sum, axis=0))

A    1.128665
B    1.753438
C   -4.722444
D   -0.185696
dtype: float64


In [69]:
print(df.apply(np.sum, axis=1))

2019-05-25   -1.481886
2019-05-26   -1.289255
2019-05-27    0.594267
2019-05-28    0.499470
2019-05-29   -1.098339
2019-05-30   -1.410826
2019-05-31    2.354015
2019-06-01    1.104747
2019-06-02   -0.013915
2019-06-03   -1.284314
Freq: D, dtype: float64


In [70]:
#adding and removing cols in a df

data = {'name': ['Janet', 'Nad', 'Timothy', 'June', 'Amy'],
       'year': [2012, 2012, 2013, 2014, 2014],
       'reports': [6,13,14,1,7]}
df = pd.DataFrame(data, index=
                 ['Singapore', 'China', 'Japan', 'Sweden', 'Norway'])
print(df)

              name  year  reports
Singapore    Janet  2012        6
China          Nad  2012       13
Japan      Timothy  2013       14
Sweden        June  2014        1
Norway         Amy  2014        7


In [72]:
# add schools col
schools = np.array(['Cambridge','Oxford','Oxford','Cambridge','Oxford'])
df['school'] = schools
print(df)

              name  year  reports     school
Singapore    Janet  2012        6  Cambridge
China          Nad  2012       13     Oxford
Japan      Timothy  2013       14     Oxford
Sweden        June  2014        1  Cambridge
Norway         Amy  2014        7     Oxford


In [74]:
# removing rows

print(df.drop(['China','Japan']))

            name  year  reports     school
Singapore  Janet  2012        6  Cambridge
Sweden      June  2014        1  Cambridge
Norway       Amy  2014        7     Oxford


In [75]:
# drop row named Nad

print(df[df.name != 'Nad'])

              name  year  reports     school
Singapore    Janet  2012        6  Cambridge
Japan      Timothy  2013       14     Oxford
Sweden        June  2014        1  Cambridge
Norway         Amy  2014        7     Oxford


In [76]:
# remove based on row number
print(df.drop(df.index[1]))

              name  year  reports     school
Singapore    Janet  2012        6  Cambridge
Japan      Timothy  2013       14     Oxford
Sweden        June  2014        1  Cambridge
Norway         Amy  2014        7     Oxford


In [78]:
print(df.drop(df.index[[1,2]]))

            name  year  reports     school
Singapore  Janet  2012        6  Cambridge
Sweden      June  2014        1  Cambridge
Norway       Amy  2014        7     Oxford


In [79]:
print(df.drop(df.index[-2]))

              name  year  reports     school
Singapore    Janet  2012        6  Cambridge
China          Nad  2012       13     Oxford
Japan      Timothy  2013       14     Oxford
Norway         Amy  2014        7     Oxford


In [80]:
# removing cols. drop reports col

print(df.drop('reports', axis=1))

              name  year     school
Singapore    Janet  2012  Cambridge
China          Nad  2012     Oxford
Japan      Timothy  2013     Oxford
Sweden        June  2014  Cambridge
Norway         Amy  2014     Oxford


In [81]:
print(df.drop(df.columns[1],axis=1)) #drop using columns number

              name  reports     school
Singapore    Janet        6  Cambridge
China          Nad       13     Oxford
Japan      Timothy       14     Oxford
Sweden        June        1  Cambridge
Norway         Amy        7     Oxford


In [82]:
# drop multiple cols

print(df.drop(df.columns[[1,3]], axis=1))

              name  reports
Singapore    Janet        6
China          Nad       13
Japan      Timothy       14
Sweden        June        1
Norway         Amy        7


In [83]:
# generating a crosstab:

df = pd.DataFrame(
{
    'Gender': ['Male', 'Male', 'Female', 'Female', 'Female'],
    'Team': [1,2,3,3,1]
})
print(df)

   Gender  Team
0    Male     1
1    Male     2
2  Female     3
3  Female     3
4  Female     1


In [84]:
print("displaying the distribution of genders in each team")
print(pd.crosstab(df.Gender, df.Team))

displaying the distribution of genders in each team
Team    1  2  3
Gender         
Female  1  0  2
Male    1  1  0
