In [1]:
import pandas as pd

# Villagers dataset published in the TidyTuesday repository (2020-05-05 folder).
url = (
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/"
    "master/data/2020/2020-05-05/villagers.csv"
)
df = pd.read_csv(url)

# Tally the missing entries in every column; the resulting Series is the cell output.
df.isnull().sum()

row_n           0
id              1
name            0
gender          0
species         0
birthday        0
personality     0
song           11
phrase          0
full_id         0
url             0
dtype: int64

In [None]:
# There is one missing value in 'id' and there are 11 missing values in 'song'.

In [2]:
# Get the shape of the DataFrame as a (rows, columns) tuple
df.shape

(391, 11)

In [None]:
# There are 391 rows and 11 columns.
# An observation is the individual/thing being studied (one row, here one villager), while a variable is a measurable trait of each observation (one column, e.g. species).

In [3]:
# Summary for the object-dtype (text/categorical) columns only; the numeric column row_n is left out.
# (Passing include='all' instead would cover every column.)
df.describe(include='object')

Unnamed: 0,id,name,gender,species,birthday,personality,song,phrase,full_id,url
count,390,391,391,391,391,391,380,391,391,391
unique,390,391,2,35,361,8,92,388,391,391
top,admiral,Admiral,male,cat,1-27,lazy,K.K. Country,wee one,villager-admiral,https://villagerdb.com/images/villagers/thumb/...
freq,1,1,204,23,2,60,10,2,1,1


In [4]:
import pandas as pd

# Titanic passenger data from the seaborn-data repository.
# `df` is rebound here, so the villagers frame from the earlier cells is replaced.
url = (
    "https://raw.githubusercontent.com/mwaskom/seaborn-data/"
    "master/titanic.csv"
)
df = pd.read_csv(url)

# Per-column count of missing values, shown as the cell output.
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [5]:
# Dimensions of the Titanic DataFrame as a (rows, columns) tuple
df.shape

(891, 15)

In [7]:
# Summary statistics; called with no arguments, only the numeric columns are summarized
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# 5a) By default, df.describe() only summarizes numerical columns. Some values of "age" are missing, which is why its count (714) is less than the total number of rows (891).
# 5b) An attribute (e.g. df.shape) directly retrieves stored information and is written without parentheses, while a method (e.g. df.describe()) performs an action and must be called with parentheses.

# 6. count - the number of non-missing values in a given column.
# mean - the average value in a given column.
# std - the standard deviation, i.e. how spread out the values in a given column are.
# min/max - the minimum/maximum value in a given column.
# 25%/50%/75% - quartile 1, median, and quartile 3; the value below which 25%/50%/75% of the values in the column fall.

# 7a) df.dropna() should be used if there is missing data in a vital column. In the Titanic dataset, the age of many passengers is unknown. However, the age column can provide much useful information, so I would drop the rows with a missing age rather than remove the entire column.
# 7b) del df['col'] should be used if I want to remove a column irrelevant to the topic I'm analyzing. In the Titanic dataset, I might want to see if there is any correlation between survival rate and passenger class. In this situation the other columns besides "survived" and "pclass" wouldn't be necessary to keep.
# 7c) If a column has many missing values, I would unnecessarily remove a lot of data by using df.dropna() first, so that column should be removed with del df['col'] before calling df.dropna().

In [1]:
# NOTE: this cell needs `df` to already be loaded in the current kernel session;
# run on its own after a restart it fails with NameError.
del df['deck']  # Step 1: Remove the 'deck' column
df_cleaned = df.dropna()  # Step 2: Drop rows with any missing values

NameError: name 'df' is not defined

In [3]:
import pandas as pd

# Load the Titanic dataset (reloaded here so the cell also runs on a fresh kernel)
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
df = pd.read_csv(url)

# Remove the 'deck' column first, then drop the rows that still contain missing values.
# Order matters: calling dropna() while 'deck' is present would discard every row whose deck is missing.
del df['deck']
df_cleaned = df.dropna()

# Check the result: print the first five cleaned rows
print(df_cleaned.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male  embark_town alive  alone  
0    man        True  Southampton    no  False  
1  woman       False    Cherbourg   yes  False  
2  woman       False  Southampton   yes   True  
3  woman       False  Southampton   yes  False  
4    man        True  Southampton    no   True  


In [5]:
# Verify the cleaning step in one pass: the per-column missing-value counts,
# the summary statistics, and the (rows, columns) shape.
# A newline separator yields the same text as three separate print() calls.
print(
    df_cleaned.isna().sum(),
    df_cleaned.describe(),
    df_cleaned.shape,
    sep="\n",
)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64
         survived      pclass         age       sibsp       parch        fare
count  712.000000  712.000000  712.000000  712.000000  712.000000  712.000000
mean     0.404494    2.240169   29.642093    0.514045    0.432584   34.567251
std      0.491139    0.836854   14.492933    0.930692    0.854181   52.938648
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    1.000000   20.000000    0.000000    0.000000    8.050000
50%      0.000000    2.000000   28.000000    0.000000    0.000000   15.645850
75%      1.000000    3.000000   38.000000    1.000000    1.000000   33.000000
max      1.000000    3.000000   80.000000    5.000000    6.000000  512.329200
(712, 14)


In [None]:
# Since an overwhelming majority of the deck column has missing values, I removed the entire column, then cleaned up the rest of the missing values. 1 column and 179 rows were removed.

In [14]:
import pandas as pd

# Admissions data from the Opensourcefordatascience/Data-sets repository.
# `df` is rebound here, so it no longer refers to the Titanic frame.
url = (
    "https://raw.githubusercontent.com/Opensourcefordatascience/Data-sets/"
    "master/admission.csv"
)
df = pd.read_csv(url)

# Summary statistics for the columns, shown as the cell output.
df.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [16]:
# General form: df.groupby("col1")["col2"].describe().
# Here the rows are split by the value of "admit" (one group of students who were admitted into the program,
# one group of students who were not), and the GRE scores are then summarized within each group.
df.groupby("admit")["gre"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,273.0,573.186813,115.830243,220.0,500.0,580.0,660.0,800.0
1.0,127.0,618.897638,108.884884,300.0,540.0,620.0,680.0,800.0


In [None]:
# There are no missing values in this dataset; however, the count values produced by df.groupby("col1")["col2"].describe() and df.describe() differ because the former first groups the rows by the unique values in column 1 and then counts within each group (here 273 + 127 = 400). The per-group counts can indicate whether there is a lack of data for specific groups.

In [1]:
# Raises NameError on a fresh kernel: pandas has not been imported as `pd` before this cell runs.
url = "https://raw.githubusercontent.com/Opensourcefordatascience/Data-sets/master/admission.csv"
df = pd.read_csv(url)
df.describe()

NameError: name 'pd' is not defined

In [None]:
# I could solve the issue easily using both ChatGPT and Google, but it was easier to simply copy and paste the full error into ChatGPT, then copy the correct code it gave back to me.

In [2]:
import pandas as pd
# Raises HTTPError 404: the file name is spelled "titanics.csv" here, while the earlier cells load "titanic.csv".
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanics.csv"
df = pd.read_csv(url)
df.describe()

HTTPError: HTTP Error 404: Not Found

In [None]:
# Again I could correct this instantly using both ChatGPT and Google.

In [4]:
import pandas as pd
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
df = pd.read_csv(url)
# Raises NameError: the DataFrame is bound to `df`; Python names are case-sensitive, so `DF` is undefined.
# ("col1"/"col2" are placeholders, not column names of the Titanic data.)
DF.groupby("col1")["col2"].describe()

NameError: name 'DF' is not defined

In [None]:
# ChatGPT was able to solve the issue right away. I wasn't able to find a solution on Google as the search was not case-sensitive.

In [5]:
pd.read_csv(url  # Raises SyntaxError: the closing parenthesis is missing

SyntaxError: incomplete input (4098100527.py, line 1)

In [None]:
# ChatGPT provided the answer instantly, but I had to search a bit on Google. However, Google provided a more thorough answer by explaining the meaning of each piece of the error, whereas ChatGPT only gave me the correct code to copy.

In [6]:
# Raises AttributeError: the DataFrame method is spelled `groupby`, without an underscore.
df.group_by("col1")["col2"].describe()

AttributeError: 'DataFrame' object has no attribute 'group_by'

In [None]:
# Both ChatGPT and Google were able to solve this instantly.

In [8]:
# Raises KeyError: the column label is lowercase 'sex'; column labels are case-sensitive.
df.groupby("Sex")["age"].describe()

KeyError: 'Sex'

In [None]:
# Both ChatGPT and Google were able to solve this instantly.

In [9]:
# Raises NameError: without quotes, `sex` is read as a variable name rather than a column label.
df.groupby(sex)["age"].describe()

NameError: name 'sex' is not defined

In [10]:
# ChatGPT did solve the error, but I couldn't find a solution on Google. ChatGPT was able to solve every error I gave it right away. Overall, it's much easier to use a chatbot than Google to solve an issue, but doing so takes away much of the actual learning.
# I didn't read the wiki-textbook; the chatbot sufficed to help me understand the material I was missing.

In [None]:
https://chatgpt.com/share/66e3a60d-df28-800e-9145-f157aea5eb8a
CHATGPT SUMMARY 2

Initial Error: You encountered a NameError when trying to use DF.groupby("col1")["col2"].describe(). The issue was that the DataFrame DF was not defined. The correct DataFrame name is df.

Syntax Error: You then faced a SyntaxError due to a missing parenthesis in pd.read_csv(url.

AttributeError: After correcting the parenthesis issue, you received an AttributeError because you used group_by instead of groupby.

KeyError: When using df.groupby("Sex")["age"].describe(), you encountered a KeyError because the column 'Sex' did not exist in the DataFrame. The correct column names needed to be checked using df.columns.

NameError (again): Finally, you faced a NameError because sex was not in quotes, so it was interpreted as a variable rather than a column name.

Solution Steps:

Correct the DataFrame name from DF to df.
Use the correct method name groupby.
Ensure column names are correctly referenced by checking with df.columns and using exact names in quotes.
Let me know if you need further clarification or assistance!