# The `Series` Datastructure

In [1]:
import pandas as pd

In [10]:

students=['Abderrahim',"Alice","Anna"]

serie=pd.Series(students)
serie

0    Abderrahim
1         Alice
2          Anna
dtype: object

In [15]:
number=[1,2,3]
for i in pd.Series(number):
    print(i)


#  panda stores series values in a 
# typed array using the Numpy library
# This offers significant speedup when processing data 
# versus traditional python lists.

1
2
3


In [17]:
students=['Abderrahim','hassan',None]

pd.Series(students)


0    Abderrahim
1        hassan
2          None
dtype: object

In [19]:
# , if we create a list of numbers, integers or floats, and put in the None type,
# pandas automatically converts this to a special floating point value designated as NaN, 
# which stands for "Not a Number".

numbers=[1,2,None]
pd.Series(numbers)
# pandas represents NaN as a floating point number

0    1.0
1    2.0
2    NaN
dtype: float64

In [21]:
# NaN is *NOT* equivilent to None and when we try the equality test, the result is False.
import numpy as np
np.nan==None
# It turns out that you actually can't do an equality test of NAN to itself. When you do, 
# the answer is always False. 
np.nan == np.nan
# Instead, you need to use special functions to test for the presence of not a number, 
# such as the Numpy library isnan().

np.isnan(np.nan)

np.True_

In [4]:
record1=pd.Series({
    'name':'Alice',
    'class':"Physics",
    'score':85
})
record2=pd.Series({
    'name':'Abderrahim',
    'class':"Math",
    'score':65
})
record3=pd.Series({
    'name':'Anna',
    'class':"English",
    'score':78
})

In [23]:
students=[("alice","Manira"),("Mohamed",'Bagera'),("Molly","Noly")]
pd.Series(students)

# We see that each of the tuples is stored in the series object, and the type is object.

0      (alice, Manira)
1    (Mohamed, Bagera)
2        (Molly, Noly)
dtype: object

In [24]:
s = pd.Series(['Physics', 'Chemistry', 'English'], index=['Alice', 'Jack', 'Molly'])
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [26]:
students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}
# When I create the series object though I'll only ask for an index with three students, and
# I'll exclude Jack
s = pd.Series(students_scores, index=['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [22]:
record1.index

Index(['name', 'class', 'score'], dtype='object')

# Querying `Series`


In [28]:
students_classes = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English',
                   'Sam': 'History'}
s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [33]:
s.iloc[0]
s.loc['Alice']

'Physics'

In [34]:
s[3]

  s[3]


'History'

In [35]:
s['Molly']

'English'

In [36]:
# So what happens if your index is a list of integers? This is a bit complicated and Pandas can't 
# determine automatically whether you're intending to query by index position or index label. So 
# you need to be careful when using the indexing operator on the Series itself. The safer option 
# is to be more explicit and use the iloc or loc attributes directly.

class_code = {99: 'Physics',
              100: 'Chemistry',
              101: 'English',
              102: 'History'}
s = pd.Series(class_code)

In [39]:
# s[0]#it gives an errror its better to use the iloc
# the pandas can't decide if its a number of the indexes or an opperatoer
s.iloc[0]

'Physics'

In [44]:
grades=[90,20,85,65]

total=0
for grade in grades:
    total+=grade
print(total/len(grades))
# This works, but it's slow. Modern computers can do many tasks simultaneously, especially, 
# but not only, tasks involving mathematics.

65.0


In [45]:
total = np.sum(grades)
print(total/len(grades))

65.0


In [49]:
numbers=pd.Series(np.random.randint(0,1000,10000))
numbers.head()
# len(numbers) #10000

10000

In [54]:
%%timeit -n 100
total =0
for number in numbers:
    total+=number
total/len(numbers)
# 4.3 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


3.27 ms ± 408 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [53]:
%%timeit -n 100
total=np.sum(numbers)
total/len(numbers)
# 312 μs ± 123 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

312 μs ± 123 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Wow! This is a pretty shocking difference in the speed and demonstrates why one should be 
# aware of parallel computing features and start thinking in functional programming terms.
# Put more simply, vectorization is the ability for a computer to execute multiple instructions
# at once, and with high performance chips, especially graphics cards, you can get dramatic
# speedups. Modern graphics cards can run thousands of instructions in parallel.