# Initial Exploratory Data Analysis

A short exploratory notebook


In [3]:
import pandas as pd
import numpy as np

# Load the posts table; the path is relative to the notebook's working directory.
data = pd.read_csv("./Data/birth_year.csv")
# Character length of every post. The vectorised `.str.len()` accessor replaces
# `apply(len)`, which raises TypeError as soon as a single post is missing
# (NaN is a float and has no len); `.str.len()` yields NaN for such rows instead.
data["posts_total_lengths"] = data["post"].str.len()
data.head(5)

FileNotFoundError: [Errno 2] No such file or directory: './Data/birth_year.csv'

In [58]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,birth_year,posts_total_lengths
count,41873.0,41873.0
mean,1988.028849,8016.621164
std,10.850916,871.179432
min,1948.0,1504.0
25%,1982.0,7929.0
50%,1990.0,8118.0
75%,1996.0,8321.0
max,2010.0,30707.0


## Simple word occurrence counting
Mostly checking whether common methods will work out of the box.

In [125]:

# WARNING: exploratory only -- this approach has major issues and should NOT be
# reused in any other code:
# - it is really slow
# - the minimum document frequency below is an arbitrary choice

from sklearn.feature_extraction.text import CountVectorizer

# Passed as `min_df`: terms that occur in fewer than this many posts are dropped.
MIN_DOCUMENT_FREQUENCY = 5
vectorizer = CountVectorizer(min_df=MIN_DOCUMENT_FREQUENCY)

# fit_transform returns a sparse count matrix (one row per post, one column per term).
X = vectorizer.fit_transform(data["post"])

# Keep X sparse: converting it to a dense DataFrame/array would need over 22 GB of
# memory. The cells below are clumsy on purpose, because any aggregation, sum or
# conversion that densifies the whole matrix would hit that same memory problem.
X

<41873x92244 sparse matrix of type '<class 'numpy.int64'>'
	with 24185029 stored elements in Compressed Sparse Row format>

In [187]:
# Vocabulary terms learned by the vectorizer, in the same order as the columns of X.
columns = vectorizer.get_feature_names_out()
# Peek at the term counts of the first post; the result is still a sparse 1 x n_terms row.
# (The bare `columns` expression that used to sit here was removed: only the last
# expression of a cell is displayed, so it had no effect.)
X[0, :]

<1x92244 sparse matrix of type '<class 'numpy.int64'>'
	with 568 stored elements in Compressed Sparse Row format>

In [202]:
# Empty frame (no rows) with one column per vocabulary term.
# NOTE(review): not referenced again in the cells visible here -- confirm it is still needed.
word_occurences = pd.DataFrame(columns = columns)
word_occurences

Unnamed: 0,00,000,0000,000000,0000001,000001,00001,0001,000ish,000km,...,市民请注意,您的,浏览器历史记录和活动引起了我们的注意,由人民供应部重新分配,社会信用,ﾟヮﾟ,𓌏ඞ,𓌏ඞ𓊹ꭿ,𓌜ඞ,𓌱ඞ


In [206]:
# Distinct birth years in ascending order.
years = np.sort(data["birth_year"].unique())

# Build one Series of per-term totals for every birth year.
# The rows are summed on the sparse matrix itself, so only a single dense vector
# (one entry per term) is ever materialised per year. This replaces the previous
# detour through `pd.DataFrame.sparse.from_spmatrix(...).sum()`, which created a
# DataFrame with one column per term for every year just to add it up.
submatrices = []
for year in years:
    mask = data["birth_year"] == year
    # Positional row numbers of this year's posts; X was built from data["post"],
    # so position i in `data` corresponds to row i of X.
    indexes = np.where(mask)[0]
    print("Birthyear: ", year, " occurs ", len(indexes), " times ")
    # sum(axis=0) adds up the selected rows per term; asarray + ravel flattens the
    # 1 x n_terms result into a plain 1-D array.
    term_totals = np.asarray(X[indexes, :].sum(axis=0)).ravel()
    # Same layout as before: integer labels 0..n_terms-1 plus a trailing "year" entry.
    submatrix = pd.Series(term_totals)
    submatrix["year"] = year
    submatrices.append(submatrix)


Birthyear:  1948  occurs  22  times 
Birthyear:  1949  occurs  65  times 
Birthyear:  1951  occurs  44  times 
Birthyear:  1953  occurs  46  times 
Birthyear:  1954  occurs  129  times 
Birthyear:  1955  occurs  2  times 
Birthyear:  1956  occurs  1  times 
Birthyear:  1957  occurs  191  times 
Birthyear:  1958  occurs  47  times 
Birthyear:  1959  occurs  22  times 
Birthyear:  1960  occurs  91  times 
Birthyear:  1961  occurs  50  times 
Birthyear:  1962  occurs  463  times 
Birthyear:  1963  occurs  203  times 
Birthyear:  1964  occurs  100  times 
Birthyear:  1965  occurs  316  times 
Birthyear:  1966  occurs  293  times 
Birthyear:  1967  occurs  265  times 
Birthyear:  1968  occurs  204  times 
Birthyear:  1969  occurs  254  times 
Birthyear:  1970  occurs  689  times 
Birthyear:  1971  occurs  317  times 
Birthyear:  1972  occurs  380  times 
Birthyear:  1973  occurs  383  times 
Birthyear:  1974  occurs  379  times 
Birthyear:  1975  occurs  278  times 
Birthyear:  1976  occurs

0           3
1           1
2           0
3           0
4           0
         ... 
92240       0
92241       0
92242       0
92243       0
year     2010
Length: 5534700, dtype: int64

In [241]:
# Put the per-year Series side by side: one column per birth year, rows aligned on
# the shared labels (term positions plus the extra "year" row).
result = pd.concat(submatrices, axis=1)
result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,3,2,2,4,4,0,0,15,1,0,...,11,17,9,13,5,6,44,7,2,0
1,1,84,8,13,1,0,1,52,12,1,...,69,171,173,27,25,50,115,14,0,2
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92241,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [242]:
# Tidy the per-year totals: birth years as column labels, vocabulary terms as row
# labels, plus a "Total" column with each term's count over all years.
# The frame is rebuilt from `submatrices` instead of rewriting `result` from itself,
# so this cell can be re-run safely (previously a second run raised KeyError on
# "year" because that row had already been dropped).
year_counts = pd.concat(submatrices, axis=1)
year_counts.columns = year_counts.loc["year"]  # the "year" row holds each column's birth year
result = year_counts.drop(index="year").set_index(columns)
result["Total"] = result.sum(axis=1)
result

year,1948,1949,1951,1953,1954,1955,1956,1957,1958,1959,...,2001,2002,2003,2004,2005,2006,2007,2008,2010,Total
00,3,2,2,4,4,0,0,15,1,0,...,17,9,13,5,6,44,7,2,0,1161
000,1,84,8,13,1,0,1,52,12,1,...,171,173,27,25,50,115,14,0,2,6186
0000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13
000000,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,5
0000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ﾟヮﾟ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
𓌏ඞ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,840
𓌏ඞ𓊹ꭿ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,56
𓌜ඞ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,896


In [243]:
# Rank terms from most to least frequent overall; this displays a sorted copy,
# `result` itself keeps its original order.
result.sort_values("Total", ascending=False)

year,1948,1949,1951,1953,1954,1955,1956,1957,1958,1959,...,2001,2002,2003,2004,2005,2006,2007,2008,2010,Total
the,1288,3861,2321,2273,5960,116,51,8921,2055,1315,...,49893,27302,16900,12850,16016,28107,11755,2519,1288,1999529
to,857,2124,1735,1498,4533,89,54,7686,2256,940,...,41329,20219,13578,11292,13080,17649,8197,2739,772,1669899
and,528,2297,2083,1238,4437,65,31,6463,1750,840,...,36992,20231,13105,11449,11681,20283,6553,1921,708,1497419
you,661,1140,858,1022,6304,44,13,4243,1706,568,...,25069,11933,9950,9994,9807,13599,7903,1667,775,1150969
it,411,1519,1086,1120,1202,44,19,4832,1142,565,...,29750,14367,9344,9140,9697,10465,7196,2356,785,1141075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
leaden,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
leaderships,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
leaped,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,5
leaper,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,5
