In [334]:
#how to create a series from a list, numpy array and dict?
import numpy as np
import pandas as pd
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)
print(ser3.head())

a    0
b    1
c    2
e    3
d    4
dtype: int32


In [335]:
# how to convert the index of a series into a columns of a dataframes?
ser = pd.Series(mydict)
df = ser.to_frame().reset_index()
df.head()

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


In [336]:
# how to combine many series to from a dataframe? 
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))
# Solution 1
df = pd.concat([ser1, ser2], axis=1)
df.head()
# Solution 2 
df = pd.DataFrame({'col1': ser1, 'col2': ser2})
df.head()

Unnamed: 0,col1,col2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


In [337]:
# How to assign name to the series index?
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

In [338]:
# How to get items of series A not present in series B?
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

In [339]:
 # How to get the items not common to both series A and series B?
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
ser_u = pd.Series(np.union1d(ser1, ser2)) #union
ser_i = pd.Series(np.intersect1d(ser1, ser2)) #intersect
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [340]:
# How to get the minimum, 25th percentile, median, 75th, and max of numeric series?
ser = pd.Series(np.random.normal(10, 5, 25))
state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([ 1.25117263,  7.70986507, 10.92259345, 13.36360403, 18.0949083 ])

In [341]:
# How to get frequency counts of unique items of a series?
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size = 30)))
ser.value_counts()

c    6
f    6
d    6
b    4
g    3
e    2
h    2
a    1
dtype: int64

In [342]:
# How to keep only top 2 most frequent values as it is and replace everything else as 'Other'?
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
print('Top 2 Freq:', ser.value_counts())
ser[~ser.isin(ser.value_counts().index[0:2])] = 'Other'
ser

Top 2 Freq: 2    4
4    3
1    3
3    2
dtype: int64


0     Other
1         4
2         2
3     Other
4     Other
5         2
6         2
7         4
8     Other
9     Other
10        2
11        4
dtype: object

In [343]:
# How to bin a numeric series to 10 groups of equal size?
ser = pd.Series(np.random.random(20))
print(ser.head())
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1],
        labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th','9th', '10th']).head()

0    0.750558
1    0.247220
2    0.152308
3    0.089280
4    0.373055
dtype: float64


0    7th
1    3rd
2    2nd
3    1st
4    4th
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

In [344]:
# How to convert a numpy array to a dataframe of given shape
ser = pd.Series(np.random.randint(1, 10, 35))
df = pd.DataFrame(ser.values.reshape(7,5))
df

Unnamed: 0,0,1,2,3,4
0,4,3,4,2,8
1,5,5,3,2,5
2,3,5,2,3,3
3,5,9,4,8,4
4,1,4,4,5,2
5,2,4,2,4,4
6,7,6,7,5,5


In [345]:
# How to find the positions of numbers that are multiples of 3 from a series?
ser = pd.Series(np.random.randint(1, 10, 7))
ser
#np.argwhere(ser % 3 == 0)
# i can't solve the problem, i will look again later.



0    5
1    4
2    2
3    2
4    6
5    8
6    7
dtype: int32

In [346]:
# How to extract items at given position from a series
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
ser.take(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

In [347]:
# How to stack two series vertically and horizontally?
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
# Vertical
ser1.append(ser2)
# Horizontal
df = pd.concat([ser1, ser2], axis=1)
df

  ser1.append(ser2)


Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [348]:
# How to get positions of items of series A in another series B?
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])
# Solution 1
[np.where(i == ser1)[0].tolist()[0] for i in ser2]
# Solution 2
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

In [349]:
# How to compute the mean squared error on an truth and predicted series?
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)
np.mean((truth-pred)**2)

0.2625584881273747

In [350]:
# How to calculate the number of characters in each word in a series?
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser.map(lambda x: len(x))


0    3
1    2
2    4
3    4
dtype: int64

In [351]:
# How to compute difference of differences between consequtive numbers of a series?
ser = pd.Series([1, 3 , 6, 9, 15, 21, 27, 35])
print(ser.diff().tolist())
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 3.0, 6.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 0.0, 3.0, 0.0, 0.0, 2.0]


In [352]:
# How to convert a series of date-string to a timeseries?
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
# Solution 1
from dateutil.parser import parse
ser.map(lambda x: parse(x))
# Solution 2
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [353]:
# How to get the day of month, week number, day of year and day of week from a series of date strings?
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))
# day of month
print('Date: ', ser_ts.dt.day.tolist())
# week number
print('Week number: ', ser_ts.dt.weekofyear.tolist())
# day of year
print('Day number of year: ', ser_ts.dt.dayofyear.tolist())
# day of week
print('Day of week: ', ser_ts.dt.day_of_week.tolist())
# i can't solve 'day of week', i will look again later.



Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  [4, 2, 5, 3, 0, 5]


  print('Week number: ', ser_ts.dt.weekofyear.tolist())


In [354]:
# How to convert year-month string to dates corresponding to the 4th day of the month?
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
# Solution 1
from dateutil.parser import parse
# Parse the date
ser_ts = ser.map(lambda x: parse(x))
# Construct date string with date as 4
ser_datestr = ser_ts.dt.year.astype('str') + '-' + ser_ts.dt.month.astype('str')+ '-' + '04'
# Format it.
[parse(i).strftime('%Y-%m-%d') for i in ser_datestr]
# Solution 2
ser.map(lambda x: parse('04' + x))

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

In [355]:
# How to filter words that contaion atleast 2 vowels from a series?
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0)for i in list('aeiou')]) >= 2)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

In [356]:
# How to filter valid emails from a series ?
emails = pd.Series(['buying book at amozom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
#pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
# Solution 1
import re
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
mask = emails.map(lambda x: bool(re.match(pattern, x)))
emails[mask]
# Solution 2
emails.str.findall(pattern, flags=re.IGNORECASE)
# Solution 3
[x[0] for x in [re.findall(pattern, email) for email in emails] if len(x) > 0]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

In [357]:
# How to get the mena of a series grouped by another series?
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weight = pd.Series(np.linspace(1, 10, 10))
print(weight.tolist())
print(fruit.tolist())
weight.groupby(fruit).mean()


[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['apple', 'carrot', 'banana', 'carrot', 'carrot', 'carrot', 'carrot', 'carrot', 'banana', 'banana']


apple     1.000000
banana    7.333333
carrot    5.333333
dtype: float64

In [358]:
# How to compute the euclidean distance between two series?
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
# Solution
sum((p - q)**2)**.5
# Solution(using func)
np.linalg.norm(p-q)

18.16590212458495

In [359]:
# How to find all the local maxina(or peaks) in a numeric series?
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

array([1, 5, 7], dtype=int64)

In [360]:
# How to replace missing spaces in a string with the least frequent character?
my_str = 'dbc deb abed gade'
ser = pd.Series(list('dbc deb abed gade'))
freq = ser.value_counts()
print(freq)
least_freq = freq.dropna().index[-1]
"".join(ser.replace(' ', least_freq))

d    4
b    3
     3
e    3
a    2
c    1
g    1
dtype: int64


'dbcgdebgabedggade'

In [361]:
# How to create a TimeSeris starting '2000-01-01' and 10 weekends(saturdays) after that having random numbers as values?
ser = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq = 'W-SAT'))
ser

2000-01-01    7
2000-01-08    3
2000-01-15    5
2000-01-22    3
2000-01-29    9
2000-02-05    7
2000-02-12    3
2000-02-19    3
2000-02-26    2
2000-03-04    1
Freq: W-SAT, dtype: int32