In [1]:
import numpy as np
import pandas as pd

In [2]:
### Load dataset

In [3]:
# Load the issues export. low_memory=False makes pandas parse the file in a
# single pass instead of in chunks, which avoids per-chunk mixed dtype inference.
df = pd.read_csv(filepath_or_buffer='archive/github_issues_tickets.csv', low_memory=False)

#### Check number of rows and columns

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(15955, 132)

#### Show Column

In [5]:
# List every column name with a 1-based position. enumerate() supplies the
# counter, so no manual increment or cleanup `del` is needed.
print('These are the columns which are present in our dataset')
for position, column in enumerate(df.columns, start=1):
    print(position, '.', column)

These are the columns which are present in our dataset
1 . answers_0_author
2 . answers_0_body
3 . answers_0_creation_time
4 . answers_1_author
5 . answers_1_body
6 . answers_1_creation_time
7 . answers_2_author
8 . answers_2_body
9 . answers_2_creation_time
10 . answers_3_author
11 . answers_3_body
12 . answers_3_creation_time
13 . answers_4_author
14 . answers_4_body
15 . answers_4_creation_time
16 . answers_5_author
17 . answers_5_body
18 . answers_5_creation_time
19 . answers_6_author
20 . answers_6_body
21 . answers_6_creation_time
22 . answers_7_author
23 . answers_7_body
24 . answers_7_creation_time
25 . answers_8_author
26 . answers_8_body
27 . answers_8_creation_time
28 . answers_9_author
29 . answers_9_body
30 . answers_9_creation_time
31 . assignee
32 . assignee_id
33 . assignee_login
34 . assignee_type
35 . assignee_url
36 . body
37 . closed_at
38 . comments
39 . comments_url
40 . created_at
41 . html_url
42 . id
43 . labels_0_color
44 . labels_0_default
45 . labels_0_descr

## Check for null values

In [6]:
# Count of missing (NaN) entries per column. The default repr elides the
# middle rows when the result is long.
df.isna().sum()

answers_0_author              0
answers_0_body                0
answers_0_creation_time       0
answers_1_author           3447
answers_1_body             3447
                           ... 
url                           0
user_id                       0
user_login                    0
user_type                     0
user_url                      0
Length: 132, dtype: int64

#### Print all Columns

In [7]:
# to_string() renders every row, so the per-column null counts are shown
# in full instead of being elided.
print(df.isna().sum().to_string())

answers_0_author               0
answers_0_body                 0
answers_0_creation_time        0
answers_1_author            3447
answers_1_body              3447
answers_1_creation_time     3447
answers_2_author            6949
answers_2_body              6949
answers_2_creation_time     6949
answers_3_author            9520
answers_3_body              9520
answers_3_creation_time     9520
answers_4_author           11442
answers_4_body             11442
answers_4_creation_time    11442
answers_5_author           12841
answers_5_body             12841
answers_5_creation_time    12841
answers_6_author           13808
answers_6_body             13808
answers_6_creation_time    13808
answers_7_author           14616
answers_7_body             14616
answers_7_creation_time    14616
answers_8_author           15200
answers_8_body             15200
answers_8_creation_time    15200
answers_9_author           15636
answers_9_body             15636
answers_9_creation_time    15636
assignee  

#### Print Only Columns which have null values

In [8]:
# assign null values into missing_values variable
missing_values = df.isna().sum()
columns_with_missing = missing_values[missing_values > 0] # Filter columns with missing values

In [9]:
# Print, without truncation, every column that has at least one null
# together with its null count.
print(columns_with_missing.to_string())

answers_1_author            3447
answers_1_body              3447
answers_1_creation_time     3447
answers_2_author            6949
answers_2_body              6949
answers_2_creation_time     6949
answers_3_author            9520
answers_3_body              9520
answers_3_creation_time     9520
answers_4_author           11442
answers_4_body             11442
answers_4_creation_time    11442
answers_5_author           12841
answers_5_body             12841
answers_5_creation_time    12841
answers_6_author           13808
answers_6_body             13808
answers_6_creation_time    13808
answers_7_author           14616
answers_7_body             14616
answers_7_creation_time    14616
answers_8_author           15200
answers_8_body             15200
answers_8_creation_time    15200
answers_9_author           15636
answers_9_body             15636
answers_9_creation_time    15636
assignee                   15955
assignee_id                12497
assignee_login             12497
assignee_t

**Note:** `Most of the columns have a large number of missing values`

#### Check info of dataset

In [10]:
# Summary of the frame: index range, column count, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15955 entries, 0 to 15954
Columns: 132 entries, answers_0_author to user_url
dtypes: float64(15), int64(12), object(105)
memory usage: 16.1+ MB


#### Show reaction columns name

In [11]:
# Show every reaction-related column. Selecting by the `reactions_` name
# prefix (instead of hard-coded positions 112..121) keeps this correct if
# columns are added, removed or reordered in the CSV.
for column in df.columns[df.columns.str.startswith('reactions_')]:
    print(column, end=', ')

reactions_confused, reactions_eyes, reactions_heart, reactions_hooray, reactions_laugh, reactions_minus_1, reactions_plus_1, reactions_rocket, reactions_total_count, reactions_url, 

## Let's Store Reactions columns in list

In [12]:
# Reaction *count* columns, selected by name rather than by position so the
# selection survives schema changes. `reactions_url` shares the prefix but
# holds a URL, not a count, so it is excluded (the original positional slice
# stopped just before it).
reaction_columns = df.columns[
    df.columns.str.startswith('reactions_') & (df.columns != 'reactions_url')
]

#### Show Maximum value of reaction

In [13]:
# Largest observed value for each reaction column, numbered from 1.
# enumerate() replaces the hand-maintained counter and its cleanup `del`.
for position, reaction in enumerate(reaction_columns, start=1):
    print(f'{position}. {reaction} {df[reaction].max()}')

1. reactions_confused 2
2. reactions_eyes 8
3. reactions_heart 5
4. reactions_hooray 3
5. reactions_laugh 3
6. reactions_minus_1 3
7. reactions_plus_1 46
8. reactions_rocket 4
9. reactions_total_count 46


#### Show Minimum value of reaction

In [14]:
# Smallest observed value for each reaction column, numbered from 1.
# enumerate() replaces the hand-maintained counter and its cleanup `del`.
for position, reaction in enumerate(reaction_columns, start=1):
    print(f'{position}. {reaction} {df[reaction].min()}')

1. reactions_confused 0
2. reactions_eyes 0
3. reactions_heart 0
4. reactions_hooray 0
5. reactions_laugh 0
6. reactions_minus_1 0
7. reactions_plus_1 0
8. reactions_rocket 0
9. reactions_total_count 0


#### Sort Dataset on the basis of reaction

In [15]:
# Order rows by total reaction count, ascending (the default), so the
# least-reacted issues come first and the most-reacted come last.
df = df.sort_values('reactions_total_count')

#### Show Top 5 Values

In [16]:
# The frame is sorted ascending by reactions_total_count, so the last rows
# returned by tail() hold the largest counts.
df['reactions_total_count'].tail()

8423     24
10661    25
7569     25
5401     28
2371     46
Name: reactions_total_count, dtype: int64

#### Show Lowest 5 Values

In [17]:
# With the frame sorted ascending by reactions_total_count, the first rows
# returned by head() hold the smallest counts.
df['reactions_total_count'].head()

0        0
10218    0
10219    0
10220    0
10221    0
Name: reactions_total_count, dtype: int64

#### Convert `milestone_due_on` from String to Datetime

In [18]:
# Keep only the rows that have a milestone due date, then preview the first
# five raw values to see the string format before converting it.
# head(5) + enumerate() replace the manual counter, `break` and `del`.
filtered_df = df[df['milestone_due_on'].notnull()]
print('Values which are not null in milestone_due_on are')
for position, row in enumerate(filtered_df['milestone_due_on'].head(5), start=1):
    print(f'{position}. {row}')

Values which are not null in milestone_due_on are
1. 2021-10-29T07:00:00Z
2. 2020-10-31T07:00:00Z
3. 2023-03-13T07:00:00Z
4. 2023-01-07T08:00:00Z
5. 2023-05-16T07:00:00Z


In [19]:
# Inspect the column's dtype and non-null count before converting it.
df['milestone_due_on'].info()

<class 'pandas.core.series.Series'>
Index: 15955 entries, 0 to 2371
Series name: milestone_due_on
Non-Null Count  Dtype 
--------------  ----- 
855 non-null    object
dtypes: object(1)
memory usage: 249.3+ KB


In [20]:
# Parse the timestamp strings (e.g. 2021-10-29T07:00:00Z) into datetimes.
# errors='coerce' turns any value that does not match the format into NaT,
# which silently loses data. Compare against the raw column and report the
# values that were lost so they can be inspected instead of disappearing.
raw_due_on = df['milestone_due_on']
parsed_due_on = pd.to_datetime(raw_due_on, format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')
unparsed_due_on = raw_due_on[raw_due_on.notna() & parsed_due_on.isna()]
if not unparsed_due_on.empty:
    print(f'{len(unparsed_due_on)} milestone_due_on value(s) could not be parsed and became NaT:')
    print(unparsed_due_on.to_string())
df['milestone_due_on'] = parsed_due_on


In [21]:
# Re-inspect the column after the conversion to confirm the new dtype and
# to compare the non-null count with the one seen before converting.
df['milestone_due_on'].info()

<class 'pandas.core.series.Series'>
Index: 15955 entries, 0 to 2371
Series name: milestone_due_on
Non-Null Count  Dtype         
--------------  -----         
854 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 249.3 KB
