In [1]:
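# Optional one-time setup: uncomment to install missing packages into the kernel's environment.
# (pyreadstat is the backend pandas uses for pd.read_spss.)
# %conda install -y scikit-learn
# %conda install -y statsmodels
# %pip install pyreadstat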

In [2]:
import pandas as pd
import statsmodels.formula.api as sm
import numpy as np
from sklearn import linear_model

In [3]:
# Load the Reddit export; the relative path is resolved against the notebook's working directory.
reddit = pd.read_csv('reddit_data_example.csv')

In [4]:
# Load the SPSS survey file (pd.read_spss relies on the pyreadstat package being installed).
mobile_use = pd.read_spss('mobile-use.sav')

In [5]:
# Preview the first five rows of the survey data.
mobile_use.head()

Unnamed: 0,Age,spBrand,spAge,spWBUseFreq,spSEUseFreq,spComfInstallBrowser,spComfDefaultBrowserChange,spComfDefaultSearchChange,Sex,GUID,spComfInstallBrowser_recode,spComfDefaultBrowserChange_recode,spComfDefaultSearchChange_recode,Ethnicity,Education,Income,spComfort
0,38.0,HTC,Less than 1 year old,Many times a day,Many times a day,Very uncomfortable,Very uncomfortable,Very uncomfortable,Female,0007a34a-df0f-ff99-dfc8-942cbfd6dba1,0.0,0.0,0.0,Black or African American,Secondary School,,0.0
1,46.0,,,,,Very comfortable,Very comfortable,Very comfortable,Female,00282bac-006c-7aaf-a02e-1c038f3f9441,4.0,4.0,4.0,White,None completed,"$0 - $14,999",4.0
2,29.0,,,,,Comfortable,Comfortable,Comfortable,Female,00a4c861-1c1d-e660-60e3-ff8f0d54db96,3.0,3.0,3.0,White,University/Higher Education,"$15,000 - $24,999",3.0
3,62.0,Google,1 to less than 2 years old,At least once a day,At least once per week,Neither uncomfortable nor comfortable,Uncomfortable,Very comfortable,Female,014ab9a2-1f31-8d9c-8e20-8e614dd9174f,2.0,1.0,4.0,Asian,High School/Tertiary/Tech. College,,2.333333
4,48.0,Don’t know smartphone brand,Don’t know,At least once per month,At least once per month,Comfortable,Comfortable,Comfortable,Female,01cf5015-1cc1-5cc2-0b97-67e45a2e18f2,3.0,3.0,3.0,White,University/Higher Education,,3.0


In [6]:
# Check how each column was typed on load (categorical vs. numeric vs. plain object).
mobile_use.dtypes

Age                                   float64
spBrand                              category
spAge                                category
spWBUseFreq                          category
spSEUseFreq                          category
spComfInstallBrowser                 category
spComfDefaultBrowserChange           category
spComfDefaultSearchChange            category
Sex                                  category
GUID                                   object
spComfInstallBrowser_recode           float64
spComfDefaultBrowserChange_recode     float64
spComfDefaultSearchChange_recode      float64
Ethnicity                              object
Education                              object
Income                                 object
spComfort                             float64
dtype: object

In [7]:
# Inspect the raw phone-age labels and their category list before recoding them.
mobile_use['spAge']

0             Less than 1 year old
1                              NaN
2                              NaN
3       1 to less than 2 years old
4                       Don’t know
                   ...            
1218                           NaN
1219          Less than 1 year old
1220                           NaN
1221    2 to less than 3 years old
1222    1 to less than 2 years old
Name: spAge, Length: 1223, dtype: category
Categories (7, object): ['1 to less than 2 years old', '2 to less than 3 years old', '3 to less than 4 years old', '4 to less than 5 years old', '5 or more years old', 'Don’t know', 'Less than 1 year old']

In [8]:
# Recode the phone-age bands to an ordinal 1-6 scale; 'Don’t know' becomes missing.
#
# An explicit label -> code mapping replaces the two positional parallel lists
# previously passed to .replace(): it keeps every label next to its code, and it
# avoids calling .replace() on a categorical column, which pandas 2.2 deprecates.
# The result is a plain numeric (float, because of NaN) column rather than a
# categorical whose categories happen to be numbers.
sp_age_codes = {
    'Less than 1 year old': 1,
    '1 to less than 2 years old': 2,
    '2 to less than 3 years old': 3,
    '3 to less than 4 years old': 4,
    '4 to less than 5 years old': 5,
    '5 or more years old': 6,
    'Don’t know': np.nan,
}
# Cast to object first so .map() returns ordinary values instead of trying to
# rebuild a categorical from the mapped labels.
mobile_use['spAge_r'] = mobile_use['spAge'].astype(object).map(sp_age_codes)

In [9]:
# Dummy-code Sex, dropping the first category so it serves as the reference level.
# dtype=int pins 0/1 integer columns: pandas >= 2.0 returns booleans by default,
# and the formula interface treats boolean columns as categorical (the term would
# be renamed to something like `Sex[T.True]`, breaking constraint strings that
# refer to `Sex` by name).
# NOTE(review): a missing Sex value yields an all-zero dummy row, i.e. it is
# indistinguishable from the reference level — confirm Sex has no missing values.
sex_dummies = pd.get_dummies(mobile_use['Sex'], drop_first=True, dtype=int)
sex_dummies.head()

Unnamed: 0,Male
0,0
1,0
2,0
3,0
4,0


In [10]:
# Respondents per smartphone brand (counted via non-null GUIDs), largest first.
(
    mobile_use
    .groupby('spBrand')['GUID']
    .count()
    .sort_values(ascending=False)
)

spBrand
Apple                                   304
Samsung                                 279
Motorola                                 85
LG                                       75
Google                                   64
Other brand (please specify)             22
Blackberry                               11
Oppo                                     10
OnePlus                                   9
Don’t know smartphone brand               9
Microsoft (Android operating System)      9
Microsoft (Windows operating System)      9
HTC                                       9
Nokia                                     8
Lenovo                                    7
Sony                                      7
Honor                                     5
ZTE                                       5
Vivo                                      4
Huawei                                    3
RealMe                                    3
Xiaomi                                    3
Name: GUID, dtype: int64

In [11]:
# Dummy-code the phone brand, dropping the first category as the reference level.
# dtype=int pins 0/1 integer columns: pandas >= 2.0 returns booleans by default,
# and the formula interface treats boolean columns as categorical (terms would be
# renamed to e.g. `Samsung[T.True]`, breaking constraint strings that refer to
# the brand columns by name).
# NOTE(review): respondents with a missing brand get an all-zero dummy row and
# therefore look like the reference group — confirm this is intended.
brand_dummies = pd.get_dummies(mobile_use['spBrand'], drop_first=True, dtype=int)
brand_dummies.head()

Unnamed: 0,Blackberry,Don’t know smartphone brand,Google,HTC,Honor,Huawei,LG,Lenovo,Microsoft (Android operating System),Microsoft (Windows operating System),...,Nokia,OnePlus,Oppo,Other brand (please specify),RealMe,Samsung,Sony,Vivo,Xiaomi,ZTE
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Assemble the regression frame: outcome (Comfort) plus respondent age,
# recoded phone age and the Male indicator, aligned on the shared row index.
model_data = pd.concat(
    [
        mobile_use['spComfort'],
        mobile_use['Age'],
        mobile_use['spAge_r'],
        sex_dummies['Male'],
    ],
    axis=1,
    keys=['Comfort', 'Age', 'spAge', 'Sex'],
)

In [13]:
# Add indicators for four selected brands only; every other brand is left
# without a column and so falls into the reference group.
model_data = model_data.join(
    brand_dummies.loc[:, ['Samsung', 'Motorola', 'LG', 'Google']]
)

In [14]:
# Listwise deletion: keep only rows with no missing value in any model column.
model_data = model_data.dropna(axis=0, how='any')

In [15]:
# Display the complete-case modelling frame (pandas truncates the middle rows).
model_data

Unnamed: 0,Comfort,Age,spAge,Sex,Samsung,Motorola,LG,Google
0,0.000000,38.0,1,0,0,0,0,0
3,2.333333,62.0,2,0,0,0,0,1
5,4.000000,35.0,2,1,1,0,0,0
6,3.000000,62.0,2,0,1,0,0,0
7,4.000000,41.0,2,1,0,0,1,0
...,...,...,...,...,...,...,...,...
1216,3.333333,31.0,5,0,0,0,0,1
1217,0.000000,77.0,1,0,1,0,0,0
1219,3.000000,61.0,1,0,1,0,0,0
1221,2.333333,22.0,3,1,0,0,0,0


In [16]:
# Store phone age as a plain integer so the formula treats it as a numeric predictor.
model_data = model_data.astype({'spAge': 'int64'})

In [17]:
# Specify the OLS model: comfort regressed on phone age, four brand indicators,
# respondent age and sex. Nothing is estimated until .fit() is called.
model_1 = sm.ols(
    formula=(
        'Comfort ~ spAge + Samsung + Motorola + LG + Google '
        '+ Age + Sex'
    ),
    data=model_data,
)

In [18]:
# Estimate the model; results_1 holds the coefficients and the test methods used in later cells.
results_1 = model_1.fit()

In [19]:
# Estimated coefficients, labelled by term (intercept first).
results_1.params

Intercept    2.743660
spAge        0.025868
Samsung      0.354890
Motorola     0.033473
LG           0.244952
Google       0.030455
Age         -0.014904
Sex          0.336894
dtype: float64

In [20]:
# Number of fitted coefficients, intercept included; this is the size needed for the restriction matrix.
len(results_1.params)

8

In [21]:
# 8x8 identity matrix: each row picks out exactly one of the eight coefficients.
np.eye(8)

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

In [22]:
# Restriction matrix for the hypothesis tests: one row/column per fitted
# coefficient. The size is taken from the fitted model rather than hard-coded,
# so the matrix stays valid if predictors are added to or removed from the formula.
A = np.identity(len(results_1.params))
# Dropping the first row excludes the intercept, leaving one restriction per slope.
A[1:, :]

array([[0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

In [23]:
# Joint F-test that all coefficients except the intercept are zero
# (each row of A[1:, :] selects one non-intercept coefficient).
results_1.f_test(A[1:, :])

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=11.790512404385419, p=1.9915501974595202e-14, df_denom=909, df_num=7>

In [24]:
# The model's built-in overall F statistic and p-value, for comparison with the manual joint F-test.
results_1.fvalue, results_1.f_pvalue

(11.790512404384803, 1.9915501974632348e-14)

In [25]:
# Zero vector with one entry per fitted coefficient.
np.zeros_like(results_1.params)

array([0., 0., 0., 0., 0., 0., 0., 0.])

In [26]:
# Separate t-tests of "coefficient = 0" for each predictor, written as string constraints
# that refer to the terms by name.
results_1.t_test('spAge = 0, Samsung = 0, Motorola = 0, LG = 0, Google = 0, Age = 0, Sex = 0')

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.0259      0.028      0.908      0.364      -0.030       0.082
c1             0.3549      0.090      3.926      0.000       0.178       0.532
c2             0.0335      0.140      0.238      0.812      -0.242       0.309
c3             0.2450      0.148      1.653      0.099      -0.046       0.536
c4             0.0305      0.161      0.190      0.850      -0.285       0.346
c5            -0.0149      0.002     -6.747      0.000      -0.019      -0.011
c6             0.3369      0.077      4.351      0.000       0.185       0.489

In [27]:
# The same per-coefficient t-tests expressed with the restriction matrix instead of strings.
results_1.t_test(A[1:, :])

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.0259      0.028      0.908      0.364      -0.030       0.082
c1             0.3549      0.090      3.926      0.000       0.178       0.532
c2             0.0335      0.140      0.238      0.812      -0.242       0.309
c3             0.2450      0.148      1.653      0.099      -0.046       0.536
c4             0.0305      0.161      0.190      0.850      -0.285       0.346
c5            -0.0149      0.002     -6.747      0.000      -0.019      -0.011
c6             0.3369      0.077      4.351      0.000       0.185       0.489

In [28]:
# Preview the first five rows of the raw Reddit data.
reddit.head()

Unnamed: 0,Date,Domain,Sentiment,Language,Author,Thread Entry Type,Reddit Author Awardee Karma,Reddit Author Awarder Karma,Reddit Author Karma,Reddit Comments,Reddit Score,Reddit Score Upvote Ratio
0,2/9/2023 20:12,reddit.com,neutral,en,AutoModerator,reply,2397363,0,2000,,1,
1,2/9/2023 17:42,reddit.com,neutral,en,AutoModerator,reply,2397218,0,2000,,1,
2,2/9/2023 17:41,reddit.com,neutral,en,AutoModerator,reply,2397218,0,2000,,1,
3,2/8/2023 22:12,reddit.com,negative,en,KaylaST,reply,110,14,1470,,1,
4,2/8/2023 20:15,reddit.com,negative,en,CaveLady3000,post,311,73,4148,0.0,1,1.0


In [29]:
# Convert the timestamp and the low-cardinality text columns to proper dtypes.
# The datetime unit is spelled out because pandas >= 2.0 rejects the unit-less
# 'datetime64' alias in astype() with a TypeError.
# NOTE(review): the sample rows look like month/day/year hour:minute — confirm
# the whole file uses that layout, since parsing is inferred rather than forced.
reddit = reddit.astype({
    'Date': 'datetime64[ns]',
    'Sentiment': 'category',
    'Language': 'category',
    'Author': 'category',
    'Thread Entry Type': 'category',
})

In [30]:
# Preview the first five rows again after the dtype conversion.
reddit.head()

Unnamed: 0,Date,Domain,Sentiment,Language,Author,Thread Entry Type,Reddit Author Awardee Karma,Reddit Author Awarder Karma,Reddit Author Karma,Reddit Comments,Reddit Score,Reddit Score Upvote Ratio
0,2023-02-09 20:12:00,reddit.com,neutral,en,AutoModerator,reply,2397363,0,2000,,1,
1,2023-02-09 17:42:00,reddit.com,neutral,en,AutoModerator,reply,2397218,0,2000,,1,
2,2023-02-09 17:41:00,reddit.com,neutral,en,AutoModerator,reply,2397218,0,2000,,1,
3,2023-02-08 22:12:00,reddit.com,negative,en,KaylaST,reply,110,14,1470,,1,
4,2023-02-08 20:15:00,reddit.com,negative,en,CaveLady3000,post,311,73,4148,0.0,1,1.0


In [31]:
# The 30 most active authors, by number of entries with a non-null Date.
(
    reddit
    .groupby('Author')['Date']
    .count()
    .sort_values(ascending=False)
    .head(30)
)

Author
AutoModerator          1861
HeartTelegraph2         178
happyduck18             118
EnsconcedScone          115
anonymous_october        99
ClassicReply             78
theycallmesasha          75
wave-life                69
Love4SaveFerris          62
darling_di               59
ut3naa                   56
NFCourt                  55
bean-diccted             50
memegoddess2004          44
sinisasin                42
Jammertal17              41
FiguringItOut--          40
blurryleo                39
depressoacc              38
thirdeyeherbivore        37
throwawayfaptingz        35
AllieKat23               34
Paradox_Blobfish         33
All4-1-4All              33
iloveyoubecauseican      32
Elizabeth2586            30
orbit4eva                30
rhra99                   30
118434                   29
lets_kick_this           28
Name: Date, dtype: int64

In [32]:
# Per-author summary statistics of Reddit Score, ordered by how many entries each author has.
(
    reddit
    .groupby('Author')['Reddit Score']
    .describe()
    .sort_values('count', ascending=False)
    .head(30)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AutoModerator,1861.0,0.871037,0.45262,-3.0,1.0,1.0,1.0,8.0
HeartTelegraph2,178.0,1.123596,3.948311,-2.0,0.0,0.0,0.0,36.0
happyduck18,118.0,5.635593,11.863659,0.0,0.0,2.0,4.75,81.0
EnsconcedScone,115.0,1.443478,2.759962,0.0,0.0,0.0,2.0,16.0
anonymous_october,99.0,5.616162,10.200747,0.0,0.0,2.0,4.0,56.0
ClassicReply,78.0,4.641026,6.654701,0.0,1.0,2.0,5.0,32.0
theycallmesasha,75.0,7.746667,10.248393,0.0,2.0,3.0,9.0,51.0
wave-life,69.0,3.333333,6.225344,1.0,1.0,2.0,3.0,36.0
Love4SaveFerris,62.0,2.16129,2.341315,0.0,1.0,1.0,2.0,12.0
darling_di,59.0,1.813559,5.802718,0.0,0.0,0.0,2.0,42.0


In [33]:
# Per-author summary statistics of Reddit Score, ordered by mean score.
(
    reddit
    .groupby('Author')['Reddit Score']
    .describe()
    .sort_values('mean', ascending=False)
    .head(30)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aberletsc,1.0,90.0,,90.0,90.0,90.0,90.0,90.0
Lumenesa,1.0,54.0,,54.0,54.0,54.0,54.0,54.0
paganporridge,1.0,53.0,,53.0,53.0,53.0,53.0,53.0
endless-strive,1.0,46.0,,46.0,46.0,46.0,46.0,46.0
throwsAndb0x,1.0,41.0,,41.0,41.0,41.0,41.0,41.0
CorgiLover831,1.0,41.0,,41.0,41.0,41.0,41.0,41.0
Obligation-Mediocre,2.0,41.0,24.041631,24.0,32.5,41.0,49.5,58.0
esotericspeech,1.0,39.0,,39.0,39.0,39.0,39.0,39.0
PengTrash,1.0,37.0,,37.0,37.0,37.0,37.0,37.0
sadauntrbn,1.0,35.0,,35.0,35.0,35.0,35.0,35.0


In [34]:
# Exclude every entry authored by the AutoModerator account.
reddit_no_mod = reddit[reddit['Author'].ne('AutoModerator')]

In [35]:
# Renumber the filtered rows 0..n-1. drop=True discards the old row labels
# instead of inserting them as a stray 'index' column; without it, re-running
# the filter and this cell fails with "cannot insert index, already exists".
reddit = reddit_no_mod.reset_index(drop=True)

In [36]:
# Dummy-code the entry type, dropping the first category as the reference level.
# dtype=int pins 0/1 integer columns: pandas >= 2.0 returns booleans by default,
# which the formula interface would treat as a categorical term.
thread_type_dummies = pd.get_dummies(reddit['Thread Entry Type'], drop_first=True, dtype=int)

In [37]:
# Preview the entry-type indicator column(s).
thread_type_dummies.head()

Unnamed: 0,reply
0,1
1,0
2,1
3,1
4,1


In [38]:
# One indicator column per sentiment label (no category dropped here; the
# reference level is chosen later by which columns enter the model).
# dtype=int pins 0/1 integer columns: pandas >= 2.0 returns booleans by default,
# which the formula interface would treat as categorical terms.
sentiment_dummies = pd.get_dummies(reddit['Sentiment'], dtype=int)

In [39]:
# Preview the sentiment indicator columns.
sentiment_dummies.head()

Unnamed: 0,negative,neutral,positive
0,1,0,0
1,1,0,0
2,1,0,0
3,0,1,0
4,0,1,0


In [40]:
# Assemble the regression frame for the Reddit model: score as the outcome, plus
# the reply indicator and the negative/positive sentiment indicators (the
# remaining sentiment label is omitted and acts as the reference level).
reddit_model_data = pd.concat(
    [
        reddit['Reddit Score'],
        thread_type_dummies['reply'],
        sentiment_dummies['negative'],
        sentiment_dummies['positive'],
    ],
    axis=1,
    keys=['Score', 'Reply', 'Negative', 'Positive'],
)

In [41]:
# Preview the first five rows of the Reddit modelling frame.
reddit_model_data.head()

Unnamed: 0,Score,Reply,Negative,Positive
0,1,1,1,0
1,1,0,1,0
2,5,1,1,0
3,1,1,0,0
4,1,1,0,0


In [42]:
# Specify and estimate the OLS model of score on entry type and sentiment.
model_2 = sm.ols(
    formula='Score ~ Reply + Negative + Positive',
    data=reddit_model_data,
)
results_2 = model_2.fit()

In [43]:
# Estimated coefficients, labelled by term (intercept first).
results_2.params

Intercept    15.023865
Reply       -12.764277
Negative      1.155742
Positive      0.273196
dtype: float64

In [44]:
# Joint F-test that the three slopes are zero: rows 1-3 of a 4x4 identity
# matrix select every coefficient except the intercept.
results_2.f_test(np.eye(4)[1:, :])

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=591.6626696555738, p=0.0, df_denom=6.27e+03, df_num=3>

In [45]:
# The model's built-in overall F statistic and p-value, for comparison with the manual joint F-test.
results_2.fvalue, results_2.f_pvalue

(591.662669655574, 0.0)

In [46]:
# Separate t-tests of "coefficient = 0" for each slope, using the non-intercept
# rows of a 4x4 identity matrix as the restrictions.
results_2.t_test(np.eye(4)[1:, :])

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0           -12.7643      0.312    -40.883      0.000     -13.376     -12.152
c1             1.1557      0.250      4.624      0.000       0.666       1.646
c2             0.2732      0.274      0.998      0.318      -0.263       0.810