In [1]:
import numpy as np
import pandas as pd

In [2]:
my_series = pd.Series( data = [2,3,5,4],             # Data
                       index= ['a', 'b', 'c', 'd'])  # Indexes

my_series

a    2
b    3
c    5
d    4
dtype: int64

In [3]:
my_dict = {"x": 2, "a": 5, "b": 4, "c": 8}

my_series2 = pd.Series(my_dict)

my_series2 

x    2
a    5
b    4
c    8
dtype: int64

In [4]:
my_series["a"]


2

In [7]:
my_series.iloc[0]

2

In [8]:
my_series[1:3]

b    3
c    5
dtype: int64

In [9]:
my_series + my_series

a     4
b     6
c    10
d     8
dtype: int64

In [10]:
my_series + my_series2

a     7.0
b     7.0
c    13.0
d     NaN
x     NaN
dtype: float64

In [11]:
np.mean(my_series)        # numpy array functions generally work on series

3.5

In [12]:
# Create a dictionary with some different data types as values

my_dict = {"name" : ["Joe","Bob","Frans"],
           "age" : np.array([10,15,20]),
           "weight" : (75,123,239),
           "height" : pd.Series([4.5, 5, 6.1], 
                                index=["Joe","Bob","Frans"]),
           "siblings" : 1,
           "gender" : "M"}

df = pd.DataFrame(my_dict)   # Convert the dict to DataFrame

df                           # Show the DataFrame

Unnamed: 0,name,age,weight,height,siblings,gender
Joe,Joe,10,75,4.5,1,M
Bob,Bob,15,123,5.0,1,M
Frans,Frans,20,239,6.1,1,M


In [13]:
my_dict2 = {"name" : ["Joe","Bob","Frans"],
           "age" : np.array([10,15,20]),
           "weight" : (75,123,239),
           "height" :[4.5, 5, 6.1],
           "siblings" : 1,
           "gender" : "M"}

df2 = pd.DataFrame(my_dict2)   # Convert the dict to DataFrame

df2     

Unnamed: 0,name,age,weight,height,siblings,gender
0,Joe,10,75,4.5,1,M
1,Bob,15,123,5.0,1,M
2,Frans,20,239,6.1,1,M


In [14]:
df2 = pd.DataFrame(my_dict2,
                   index = my_dict["name"] )

df2

Unnamed: 0,name,age,weight,height,siblings,gender
Joe,Joe,10,75,4.5,1,M
Bob,Bob,15,123,5.0,1,M
Frans,Frans,20,239,6.1,1,M


In [15]:
# Get a column by name

df2["weight"]

Joe       75
Bob      123
Frans    239
Name: weight, dtype: int64

In [16]:
df2.weight

Joe       75
Bob      123
Frans    239
Name: weight, dtype: int64

In [17]:
# Delete a column

del df2['name']

In [18]:
# Add a new column

df2["IQ"] = [130, 105, 115]

df2

Unnamed: 0,age,weight,height,siblings,gender,IQ
Joe,10,75,4.5,1,M,130
Bob,15,123,5.0,1,M,105
Frans,20,239,6.1,1,M,115


In [19]:
df2["Married"] = False

df2

Unnamed: 0,age,weight,height,siblings,gender,IQ,Married
Joe,10,75,4.5,1,M,130,False
Bob,15,123,5.0,1,M,105,False
Frans,20,239,6.1,1,M,115,False


In [20]:
df2["College"] = pd.Series(["Harvard"],
                           index=["Frans"])

df2

Unnamed: 0,age,weight,height,siblings,gender,IQ,Married,College
Joe,10,75,4.5,1,M,130,False,
Bob,15,123,5.0,1,M,105,False,
Frans,20,239,6.1,1,M,115,False,Harvard


In [21]:
df2.loc["Joe"]          # Select row "Joe"

age            10
weight         75
height        4.5
siblings        1
gender          M
IQ            130
Married     False
College       NaN
Name: Joe, dtype: object

In [22]:
df2.loc["Joe","IQ"]     # Select row "Joe" and column "IQ"

130

In [23]:
df2.loc["Joe":"Bob" , "IQ":"College"]   # Slice by label

Unnamed: 0,IQ,Married,College
Joe,130,False,
Bob,105,False,


In [24]:
df2.iloc[0]          # Get row 0

age            10
weight         75
height        4.5
siblings        1
gender          M
IQ            130
Married     False
College       NaN
Name: Joe, dtype: object

In [25]:
df2.iloc[0, 5]       # Get row 0, column 5

130

In [26]:
df2.iloc[0:2, 5:8]   # Slice by numeric row and column index

Unnamed: 0,IQ,Married,College
Joe,130,False,
Bob,105,False,


In [27]:
boolean_index = [False, True, True]  

df2[boolean_index] 

Unnamed: 0,age,weight,height,siblings,gender,IQ,Married,College
Bob,15,123,5.0,1,M,105,False,
Frans,20,239,6.1,1,M,115,False,Harvard


In [28]:
# Create a boolean sequence with a logical comparison
boolean_index = df2["age"] > 12

# Use the index to get the rows where age > 12
df2[boolean_index]

Unnamed: 0,age,weight,height,siblings,gender,IQ,Married,College
Bob,15,123,5.0,1,M,105,False,
Frans,20,239,6.1,1,M,115,False,Harvard


In [29]:
df2[ df2["age"] > 12 ]

Unnamed: 0,age,weight,height,siblings,gender,IQ,Married,College
Bob,15,123,5.0,1,M,105,False,
Frans,20,239,6.1,1,M,115,False,Harvard


In [35]:
titanic_train = pd.read_csv("train.csv")

type(titanic_train)

pandas.core.frame.DataFrame

In [36]:
titanic_train.shape      # Check dimensions

(891, 12)

In [37]:
titanic_train.head(6)    # Check the first 6 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [38]:
titanic_train.tail(6)   # Check the last 6 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [39]:
titanic_train.index = titanic_train["Name"]  # Set index to name
del titanic_train["Name"]                    # Delete name column

print(titanic_train.index[0:10])             # Print new indexes

Index(['Braund, Mr. Owen Harris',
       'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
       'Heikkinen, Miss. Laina',
       'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
       'Allen, Mr. William Henry', 'Moran, Mr. James',
       'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard',
       'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
       'Nasser, Mrs. Nicholas (Adele Achem)'],
      dtype='object', name='Name')


In [40]:
titanic_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [41]:
titanic_train.describe()    # Summarize the first 6 columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [42]:
np.mean(titanic_train,
        axis=0)          # Get the mean of each numeric column

TypeError: Could not convert ['malefemalefemalefemalemalemalemalemalefemalefemalefemalefemalemalemalefemalefemalemalemalefemalefemalemalemalefemalemalefemalefemalemalemalefemalemalemalefemalefemalemalemalemalemalemalefemalefemalefemalefemalemalefemalefemalemalemalefemalemalefemalemalemalefemalefemalemalemalefemalemalefemalemalemalefemalemalemalemalemalefemalemalefemalemalemalefemalemalemalemalemalemalemalemalefemalemalemalefemalemalefemalefemalemalemalefemalemalemalemalemalemalemalemalemalemalefemalemalefemalemalemalemalemalemalefemalemalemalefemalemalefemalemalefemalefemalemalemalemalemalefemalemalemalemalefemalemalemalemalemalefemalemalemalemalefemalefemalemalemalefemalemalemalemalefemalefemalefemalemalemalemalemalefemalemalemalemalefemalemalemalemalemalefemalemalemalemalemalefemalemalemalemalemalefemalefemalemalemalemalemalefemalemalemalemalemalefemalemalemalefemalemalemalemalefemalemalefemalemalemalemalefemalemalefemalemalefemalefemalemalemalefemalefemalemalemalemalemalemalefemalemalemalefemalemalemalefemalemalemalemalefemalefemalemalefemalemalemalemalemalemalemalemalemalemalemalefemalefemalemalemalefemalemalefemalemalefemalemalemalefemalefemalemalemalemalemalefemalefemalemalemalemalefemalemalemalefemalefemalefemalefemalefemalefemalemalemalemalemalefemalemalemalemalefemalefemalemalemalefemalemalefemalefemalefemalemalemalefemalemalemalemalemalemalemalemalemalemalefemalefemalefemalemalefemalemalemalemalefemalemalefemalefemalemalemalefemalemalemalefemalefemalemalefemalefemalefemalefemalemalemalefemalefemalemalefemalefemalemalemalefemalefemalemalefemalemalefemalefemalefemalefemalemalemalemalefemalemalemalefemalemalemalemalefemalemalemalemalefemalefemalefemalemalemalemalemalemalemalemalemalefemalefemalefemalefemalemalemalefemalemalemalemalefemalefemalefemalefemalemalemalemalemalefemalefemalefemalemalemalemalefemalefemalemalefemalemalemalemalefemalemalefemalemalemalemalefemalefemalemalefemalemalemalefemalemalemalefemalemalefemalemalemalemalemalefemalemalemalefemalemalemalefemalefemalefemalemalefemalemalemalemalefemalemalemalefemalefemalemalemalemalefemalefemalemalemalefemalefemalefemalemalemalefemalemalemalefemalemalemalefemalemalefemalemalemalemalemalemalemalemalemalefemalefemalemalemalemalemalemalemalemalemalemalemalefemalemalemalefemalefemalefemalemalemalemalemalefemalemalemalemalefemalemalefemalefemalemalemalemalemalemalemalemalemalemalefemalemalefemalemalemalefemalefemalefemalefemalemalefemalemalemalemalemalemalemalefemalemalemalefemalemalefemalemalefemalemalemalefemalemalemalefemalemalemalemalefemalemalemalefemalefemalefemalemalefemalemalefemalefemalefemalefemalemalemalemalefemalemalemalemalemalemalemalemalefemalemalefemalemalefemalefemalemalemalemalemalefemalemalemalefemalemalemalemalefemalemalefemalemalemalefemalefemalefemalemalefemalefemalemalemalemalefemalemalemalemalemalemalefemalemalefemalemalemalefemalemalemalemalefemalemalemalemalemalemalemalemalefemalefemalefemalemalefemalemalemalefemalemalefemalefemalemalemalemalemalemalemalemalemalefemalemalemalemalemalemalemalefemalefemalemalemalefemalemalemalefemalefemalemalefemalemalemalemalemalefemalemalefemalemalefemalefemalemalemalefemalemalemalemalemalemalemalemalemalemalemalemalefemalefemalemalemalemalemalemalemalefemalefemalemalefemalemalemalemalemalemalemalemalemalefemalemalefemalemalemalemalemalemalefemalemalemalefemalemalefemalemalemalemalefemalemalefemalemalefemalemalemalemalemalemalefemalefemalemalemalefemalemalemalemalemalemalefemalefemalemalefemalefemalemalemalemalemalemalefemalemalemalemalemalemalefemalemalemalemalemalefemalemalemalefemalemalemalemalefemalemalemalemalemalefemalemalemalemalefemalemalefemalemalefemalemalemalemalemalefemalemalefemalemalemalefemalemalefemalefemalefemalemalemalemalemalefemalemalemalemalemalemalefemalemalemalemalefemalefemalemalefemalemalefemalemalemalemalemalemalefemalemalefemalemalemalemalefemalemalemalefemalemalemalemalefemalemalemalefemalemalemalemalemalemalefemalefemalemalemalemalemalefemalemalemalemalemalemalemalefemalemalemalemalemalemalemalefemalemalemalefemalefemalefemalefemalefemalemalefemalemalemalemalefemalefemalemalefemalefemalemalemalemalemalefemalemalemalefemalefemalemalemalemalefemalefemalemalefemalemalemalefemalemalefemalefemalemalemale'] to numeric

In [43]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, Braund, Mr. Owen Harris to Dooley, Mr. Patrick
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.5+ KB


In [None]:
draft = pd.read_excel('input/draft2015/draft2015.xlsx', # Path to Excel file
                     sheet_name = 'draft2015')         # Name of sheet to read from

draft.head(6)                            # Check the first 6 rows

In [None]:
draft.to_csv("draft_saved.csv") 


In [44]:
import numpy as np

# Draw 25 random numbers from -1 to 1
my_data = np.random.uniform(-1,1,25)  

for index, number in enumerate(my_data):  
    if number < 0:               
        my_data[index] = 0            # Set numbers less than 0 to 0

print(my_data)

[0.         0.03634353 0.         0.         0.73902953 0.
 0.         0.95723673 0.0223512  0.10620522 0.         0.29259459
 0.         0.07971707 0.         0.26313577 0.2379698  0.56604299
 0.         0.36770022 0.         0.55442482 0.63005991 0.55770916
 0.93755171]


In [45]:
my_data = np.random.uniform(-1,1,25)  # Generate new random numbers

my_data = np.where(my_data < 0,       # A logical test
                   0,                 # Value to set if the test is true
                   my_data)           # Value to set if the test is false

print(my_data)

[0.18629346 0.         0.         0.28610913 0.80673978 0.
 0.         0.61945887 0.27289153 0.56470188 0.         0.
 0.         0.88377041 0.66495513 0.         0.         0.
 0.3246557  0.         0.         0.60591246 0.97396294 0.
 0.05168612]
