<a href="https://colab.research.google.com/github/christine-palamara/codespaces-jupyter/blob/main/notebook_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Necessary Libraries and Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns

## Ignore Warnings and Remove (or Set) Display Limits (optional)

In [None]:
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Display limits for rendered DataFrames:
#   - columns: None removes the cap, so wide frames are never truncated
#   - rows:    show at most 200 rows before pandas truncates the view
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

### **Google Colab** or **Jupyter Notebook?**

*   If using **Google Colab**, first mount your Google Drive, then set the path of the Drive directory that contains the data file.
*   If using **Jupyter Notebook**, make sure the data file is in the same directory as this notebook.

In [None]:
# mount the drive (Google Colab only)
# The google.colab package is only importable inside a Colab runtime. Anywhere
# else (e.g. a local Jupyter kernel) the import raises ImportError, so the mount
# is skipped with a message instead of aborting a "Run All".
try:
    from google.colab import drive
except ImportError:
    print("Not running on Google Colab - skipping the Google Drive mount.")
else:
    # force_remount=True refreshes the mount if the drive was already mounted
    drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
import sys

# set path for the Google Drive directory which contains the data file
path = "/content/drive/MyDrive/MIT/3_Machine_Learning/"

# set the name of the data file
csv_file = "Advertising.csv"

# set the full path of the file to be imported
# (os.path.join gives the same result whether or not `path` ends with a slash)
file_path = os.path.join(path, csv_file)

# load the file - only inside a Colab runtime, where the Drive path can exist.
# google.colab is present in sys.modules on Colab kernels; elsewhere the load is
# skipped with a message so a "Run All" can continue to the local-file cell.
if "google.colab" in sys.modules:
    df = pd.read_csv(file_path)
else:
    print("Not running on Google Colab - skipping the Google Drive load of " + file_path)

Mounted at /content/drive


In [None]:
import os
import sys

# Run this code block if using Jupyter Notebook
# name of the data file, expected in the same directory as this notebook
local_csv = "Advertising.csv"

# load the file
# Outside Colab this always reads the local file (a missing file still raises
# FileNotFoundError). On Colab the local file is read only if it actually
# exists; otherwise `file_path` and `df` are left untouched, so data already
# loaded from Google Drive is neither overwritten nor lost to a crash.
if os.path.exists(local_csv) or "google.colab" not in sys.modules:
    file_path = local_csv
    df = pd.read_csv(file_path)
else:
    print("Running on Google Colab without a local " + local_csv + " - keeping the current data.")

## Run some basic data observations

In [None]:
# Print a structural summary of the DataFrame: index range (row count),
# column names, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  200 non-null    int64  
 1   TV          200 non-null    float64
 2   Radio       200 non-null    float64
 3   Newspaper   200 non-null    float64
 4   Sales       200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [None]:
# Preview the first rows of the dataset.
# head() returns the top 5 rows by default; pass a number to choose how many.
# A notebook cell renders only its LAST expression, so use a single head() call
# per cell and keep the count small rather than dumping a large slice.
df.head(10)

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
5,6,8.7,48.9,75.0,7.2
6,7,57.5,32.8,23.5,11.8
7,8,120.2,19.6,11.6,13.2
8,9,8.6,2.1,1.0,4.8
9,10,199.8,2.6,21.2,10.6


In [None]:
# Summary statistics for every column (count, mean, std, min, 25%, 50%, 75%, max),
# transposed so each original column becomes one row of the report.
df.describe(include='all').transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,200.0,100.5,57.879185,1.0,50.75,100.5,150.25,200.0
TV,200.0,147.0425,85.854236,0.7,74.375,149.75,218.825,296.4
Radio,200.0,23.264,14.846809,0.0,9.975,22.9,36.525,49.6
Newspaper,200.0,30.554,21.778621,0.3,12.75,25.75,45.1,114.0
Sales,200.0,14.0225,5.217457,1.6,10.375,12.9,17.4,27.0


## Back up the dataset; drop unnecessary column(s)

In [None]:
# Back up the dataset, then drop the redundant index column that came from the CSV.
#
# The work is guarded by a column check so the cell is safe to re-run:
#   - first run: the column is present -> snapshot the untouched frame, then drop
#   - re-run:    the column is already gone -> nothing happens, so the backup is
#                not overwritten with the modified frame and no KeyError is raised
index_col = 'Unnamed: 0'

if index_col in df.columns:
    # make a backup copy of the original dataset before making structural changes
    df_orig = df.copy()
    # drop the index column; rebinding (instead of inplace=True) leaves the
    # previously loaded object untouched
    df = df.drop(columns=[index_col])

# validate the drop
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## New Section