In [None]:
import os
import pandas as pd

# Locate the CSV next to the notebook: in .ipynb environments the current
# working directory stands in for the script's own folder.
csv_file_name = "QueryResults.csv"
script_dir = os.getcwd()
csv_path = os.path.join(script_dir, csv_file_name)

# Stop early with a readable message when the file is missing.
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"CSV file not found at: {csv_path}")

# header=0 treats the file's first row as a header to be replaced;
# names= supplies the column labels used throughout the notebook.
df = pd.read_csv(csv_path, header=0, names=['DATE', 'TAG', 'POSTS'])


Now that we have successfully loaded the CSV, the next step is to begin exploring the data.

In [None]:
# Peek at the five rows at the end of the DataFrame
df.tail(n=5)

Unnamed: 0,DATE,TAG,POSTS
1986,2020-07-01 00:00:00,r,5694
1987,2020-07-01 00:00:00,go,743
1988,2020-07-01 00:00:00,ruby,775
1989,2020-07-01 00:00:00,perl,182
1990,2020-07-01 00:00:00,swift,3607


In [None]:
# Peek at the five rows at the start of the DataFrame
df.head(n=5)

Unnamed: 0,DATE,TAG,POSTS
0,2008-07-01 00:00:00,c#,3
1,2008-08-01 00:00:00,assembly,8
2,2008-08-01 00:00:00,javascript,162
3,2008-08-01 00:00:00,c,85
4,2008-08-01 00:00:00,python,124


First, `df.shape` tells us that the DataFrame has 1991 rows and 3 columns (note that `shape` is an attribute, not a method, so it is written without parentheses).
Then `df.count()` displays the number of non-NaN entries in each column of the DataFrame.
All three columns return 1991, which tells us there are no NaN entries in the DataFrame we are working with.

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape

(1991, 3)

In [None]:
# Number of non-null entries in each column; a count equal to the
# row count means that column has no missing values
df.count()

DATE     1991
TAG      1991
POSTS    1991
dtype: int64

A quick summary by tag: first the total number of posts for each tag, then the number of entries (rows) each tag has in the DataFrame.

In [15]:
# Total number of posts recorded for each tag
(
    df
    .groupby('TAG')['POSTS']
    .sum()
)

TAG
assembly        34852
c              336042
c#            1423530
c++            684210
delphi          46212
go              47499
java          1696403
javascript    2056510
perl            65286
php           1361988
python        1496210
r              356799
ruby           214582
swift          273055
Name: POSTS, dtype: int64

In [None]:
# Number of non-null DATE and POSTS entries recorded for each tag
df.groupby('TAG')[['DATE', 'POSTS']].count()

Unnamed: 0_level_0,DATE,POSTS
TAG,Unnamed: 1_level_1,Unnamed: 2_level_1
assembly,144,144
c,144,144
c#,145,145
c++,144,144
delphi,144,144
go,129,129
java,144,144
javascript,144,144
perl,144,144
php,144,144


Now I want to look at what data type the DATE column is stored in and how it is displayed, and then convert it if needed for the benefit of further analysis.

In [16]:
# Inspect a single DATE entry (row label 1) to see how it is stored
df.loc[1, 'DATE']

'2008-08-01 00:00:00'

In [18]:
# Check the Python type of that same DATE entry
type(df.loc[1, 'DATE'])

str

Here we see that this is not the most convenient format: not only is it a string, it also contains the unnecessary `00:00:00` time component, which is useless in this case. So I am going to convert the column into datetime objects.

In [19]:
# Parse the DATE strings into datetime values and store them back in the same column
df['DATE'] = pd.to_datetime(df['DATE'])
df.head()

Unnamed: 0,DATE,TAG,POSTS
0,2008-07-01,c#,3
1,2008-08-01,assembly,8
2,2008-08-01,javascript,162
3,2008-08-01,c,85
4,2008-08-01,python,124


Here we converted the DATE column into datetime objects; the values are now displayed without the `00:00:00` time component.