### Importing Libraries

In [124]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Importing Dataset

In [11]:
# Read the raw ranking export; the path is relative to the notebook's working directory.
# NOTE: in the preview below the real header text shows up as the first data row and
# 'browser_rankings' is the only named column.
csv_path = 'browser_rankings_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,browser_rankings
ID,Keyword,Rank,Country,Language,Date,App ID,Date of Last Description Change,Short Description,Long Description
1,browser,2,in,en,2020-12-16,net.fast.web.browser,2020-12-18,"The small, fastest, secure Web Browser, best mobile browser, fast search Google.","One of the smallest, fastest, secure Web Brows..."
2,browser,3,in,en,2020-12-16,com.android.chrome,2020-12-18,"Fast, simple, and secure. Google Chrome browser for Android phones and tablets.","Google Chrome is a fast, easy to use, and secu..."
3,browser,4,in,en,2020-12-16,com.opera.browser,2021-02-27,"Fast, safe web browser: free VPN, Ad blocker, QR-code reader &amp; personalized news","Fast, safe and private, introducing the latest..."
3,browser,4,in,en,2020-12-16,com.opera.browser,2020-12-18,"Fast, safe web browser: free VPN, Ad blocker, QR-code reader &amp; personalized news","Opera is a fast, safe web browser for your And..."


In [12]:
# Row/column count of the frame exactly as read_csv parsed it.
df.shape

(3067, 1)

In [13]:
# Column labels pandas inferred from the file.
df.columns

Index(['browser_rankings'], dtype='object')

In [None]:
# Turn the index levels into ordinary columns; pandas labels the unnamed
# levels level_0, level_1, ... as the next preview shows.
df = df.reset_index()

In [15]:
# Preview the frame now that the former index levels are regular columns.
df.head()

Unnamed: 0,level_0,level_1,level_2,level_3,level_4,level_5,level_6,level_7,level_8,browser_rankings
0,ID,Keyword,Rank,Country,Language,Date,App ID,Date of Last Description Change,Short Description,Long Description
1,1,browser,2,in,en,2020-12-16,net.fast.web.browser,2020-12-18,"The small, fastest, secure Web Browser, best m...","One of the smallest, fastest, secure Web Brows..."
2,2,browser,3,in,en,2020-12-16,com.android.chrome,2020-12-18,"Fast, simple, and secure. Google Chrome browse...","Google Chrome is a fast, easy to use, and secu..."
3,3,browser,4,in,en,2020-12-16,com.opera.browser,2021-02-27,"Fast, safe web browser: free VPN, Ad blocker, ...","Fast, safe and private, introducing the latest..."
4,3,browser,4,in,en,2020-12-16,com.opera.browser,2020-12-18,"Fast, safe web browser: free VPN, Ad blocker, ...","Opera is a fast, safe web browser for your And..."


In [18]:
# Look at the first row on its own: it holds the real column titles, not data.
df.iloc[0]

level_0                                          ID
level_1                                     Keyword
level_2                                        Rank
level_3                                     Country
level_4                                    Language
level_5                                        Date
level_6                                      App ID
level_7             Date of Last Description Change
level_8                           Short Description
browser_rankings                   Long Description
Name: 0, dtype: object

In [22]:
# Give every column a descriptive snake_case label (assigned by position,
# so the order here must match the order of the existing columns).
column_names = [
    'ID',
    'Keyword',
    'Rank',
    'Country',
    'Language',
    'Date',
    'App_ID',
    'Date_of_Last_Description_Change',
    'Short_Description',
    'Long_Description',
]
df.columns = column_names

In [24]:
# Confirm the column labels after renaming.
df.columns

Index(['ID', 'Keyword', 'Rank', 'Country', 'Language', 'Date', 'App_ID',
       'Date_of_Last_Description_Change', 'Short_Description',
       'Long_Description'],
      dtype='object')

In [28]:
# Row 0 is the file's real header line that read_csv parsed as data, so remove it.
# The check on the first cell keeps this cell safe to re-run: once the index has
# been renumbered, label 0 is a genuine record and an unconditional drop would
# silently delete it.
if not df.empty and str(df.iloc[0, 0]) == 'ID':
    df = df.drop(index=0)

In [30]:
# Renumber the rows from 0; drop=True discards the old labels instead of
# keeping them as an extra column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,ID,Keyword,Rank,Country,Language,Date,App_ID,Date_of_Last_Description_Change,Short_Description,Long_Description
0,1,browser,2,in,en,2020-12-16,net.fast.web.browser,2020-12-18,"The small, fastest, secure Web Browser, best m...","One of the smallest, fastest, secure Web Brows..."
1,2,browser,3,in,en,2020-12-16,com.android.chrome,2020-12-18,"Fast, simple, and secure. Google Chrome browse...","Google Chrome is a fast, easy to use, and secu..."
2,3,browser,4,in,en,2020-12-16,com.opera.browser,2021-02-27,"Fast, safe web browser: free VPN, Ad blocker, ...","Fast, safe and private, introducing the latest..."
3,3,browser,4,in,en,2020-12-16,com.opera.browser,2020-12-18,"Fast, safe web browser: free VPN, Ad blocker, ...","Opera is a fast, safe web browser for your And..."
4,4,browser,1,in,en,2020-12-16,com.cloudmosa.puffinTV,2021-02-07,Puffin TV Browser - One App to Watch All Video...,Puffin TV Browser is a special edition of Puff...


In [51]:
# Summarise dtype and non-null count for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066 entries, 0 to 3065
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   ID                               3066 non-null   object
 1   Keyword                          3066 non-null   object
 2   Rank                             3050 non-null   object
 3   Country                          3066 non-null   object
 4   Language                         3066 non-null   object
 5   Date                             3066 non-null   object
 6   App_ID                           3066 non-null   object
 7   Date_of_Last_Description_Change  3066 non-null   object
 8   Short_Description                3066 non-null   object
 9   Long_Description                 3066 non-null   object
dtypes: object(10)
memory usage: 239.7+ KB


In [53]:
# Count missing values per column.
df.isnull().sum()

ID                                  0
Keyword                             0
Rank                               16
Country                             0
Language                            0
Date                                0
App_ID                              0
Date_of_Last_Description_Change     0
Short_Description                   0
Long_Description                    0
dtype: int64

### Downloading the Sentence Similarity Model

In [66]:
# Identifier of the pretrained sentence-embedding model to load
# (presumably fetched from the Hugging Face hub on first use — confirm).
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [54]:
def _rowwise_cosine(left, right):
    """Cosine similarity between matching rows of two embedding matrices.

    Parameters
    ----------
    left, right : 2-D arrays of identical shape, one embedding per row.

    Returns
    -------
    1-D array whose i-th entry is the cosine similarity of ``left[i]`` and
    ``right[i]``. A row with zero norm yields 0 rather than NaN.
    """
    dots = np.sum(left * right, axis=1)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms != 0)


# Embed each text column exactly once, letting the model batch the inputs.
# The earlier per-row loop embedded every text twice and issued three tiny
# encode calls per row. Batched encoding can differ from single-sentence
# encoding by floating-point noise, which the 2-decimal rounding below absorbs
# in all but borderline cases.
# NOTE(review): the model presumably truncates inputs beyond its maximum
# sequence length, so long descriptions may be only partly embedded — confirm
# with model.max_seq_length.
kw_emb = model.encode(df['Keyword'].tolist())
sd_emb = model.encode(df['Short_Description'].tolist())
ld_emb = model.encode(df['Long_Description'].tolist())

# One score per row for each pair of fields, rounded to 2 decimals as before.
# Note that later correlations are therefore computed on the rounded values.
kw_sd_ss = list(np.round(_rowwise_cosine(kw_emb, sd_emb), 2))  # keyword vs short description
kw_ld_ss = list(np.round(_rowwise_cosine(kw_emb, ld_emb), 2))  # keyword vs long description
sd_ld_ss = list(np.round(_rowwise_cosine(sd_emb, ld_emb), 2))  # short vs long description

In [56]:
# Attach each list of similarity scores to the frame under its own column name.
for column_name, scores in (
    ('kw_sd_ss', kw_sd_ss),
    ('kw_ld_ss', kw_ld_ss),
    ('sd_ld_ss', sd_ld_ss),
):
    df[column_name] = scores

In [57]:
# Preview the frame with the three similarity-score columns appended.
df.head()

Unnamed: 0,ID,Keyword,Rank,Country,Language,Date,App_ID,Date_of_Last_Description_Change,Short_Description,Long_Description,kw_sd_ss,kw_ld_ss,sd_ld_ss
0,1,browser,2,in,en,2020-12-16,net.fast.web.browser,2020-12-18,"The small, fastest, secure Web Browser, best m...","One of the smallest, fastest, secure Web Brows...",0.55,0.52,0.77
1,2,browser,3,in,en,2020-12-16,com.android.chrome,2020-12-18,"Fast, simple, and secure. Google Chrome browse...","Google Chrome is a fast, easy to use, and secu...",0.59,0.4,0.73
2,3,browser,4,in,en,2020-12-16,com.opera.browser,2021-02-27,"Fast, safe web browser: free VPN, Ad blocker, ...","Fast, safe and private, introducing the latest...",0.46,0.42,0.66
3,3,browser,4,in,en,2020-12-16,com.opera.browser,2020-12-18,"Fast, safe web browser: free VPN, Ad blocker, ...","Opera is a fast, safe web browser for your And...",0.46,0.45,0.72
4,4,browser,1,in,en,2020-12-16,com.cloudmosa.puffinTV,2021-02-07,Puffin TV Browser - One App to Watch All Video...,Puffin TV Browser is a special edition of Puff...,0.45,0.44,0.82


In [139]:
# Pearson correlation between the keyword/short-description and
# keyword/long-description scores; entry [0, 1] of the 2x2 matrix is the
# cross-correlation of the two series.
kw_sd_kw_ld_matrix = np.corrcoef(df["kw_sd_ss"], df["kw_ld_ss"])
kw_sd_kw_ld_corr = round(kw_sd_kw_ld_matrix[0][1], 2)
print(f'Correlation Coefficient between kw_sd_ss and kw_ld_ss: {kw_sd_kw_ld_corr}')

Correlation Coefficient between kw_sd_ss and kw_ld_ss: 0.39


In [141]:
# Pearson correlation between the keyword/short-description and
# short/long-description scores; entry [0, 1] of the 2x2 matrix is the
# cross-correlation of the two series.
kw_sd_sd_ld_matrix = np.corrcoef(df["kw_sd_ss"], df["sd_ld_ss"])
kw_sd_sd_ld_corr = round(kw_sd_sd_ld_matrix[0][1], 2)
print(f'Correlation Coefficient between kw_sd_ss and sd_ld_ss: {kw_sd_sd_ld_corr}')

Correlation Coefficient between kw_sd_ss and sd_ld_ss: 0.23


In [140]:
# Pearson correlation between the keyword/long-description and
# short/long-description scores; entry [0, 1] of the 2x2 matrix is the
# cross-correlation of the two series.
kw_ld_sd_ld_matrix = np.corrcoef(df["kw_ld_ss"], df["sd_ld_ss"])
kw_ld_sd_ld_corr = round(kw_ld_sd_ld_matrix[0][1], 2)
print(f'Correlation Coefficient between kw_ld_ss and sd_ld_ss: {kw_ld_sd_ld_corr}')

Correlation Coefficient between kw_ld_ss and sd_ld_ss: 0.07
