In [1]:
import pandas as pd
import networkx as nx
import community

import operator

In [2]:
# Load the tag co-occurrence edge list (columns: source, target, weight).
edges_csv = '../tags_with_wiki_relationship.csv'
df = pd.read_csv(edges_csv)
df

Unnamed: 0,source,target,weight
0,c#,decimal,3081
1,c#,double,2895
2,c#,floating-point,1946
3,c#,type-conversion,4426
4,decimal,double,790
...,...,...,...
936601,database,parcelable,3
936602,web-config,windows-phone-7,4
936603,drag-and-drop,lag,12
936604,animation,markup,14


In [3]:
# Load the per-tag wiki table (columns: TagName, Body, root).
# Rows are split on '\n' only -- presumably because Body text can contain
# other line-break characters; confirm against the CSV.
wiki_csv = '../tags_with_wiki_and_category.csv'
df_wiki = pd.read_csv(wiki_csv, lineterminator='\n')
df_wiki

Unnamed: 0,TagName,Body,root
0,java,java is a high-level object oriented programmi...,language
1,regex,Regular expressions provide a declarative lang...,
2,xml,xml (extensible markup language) is a structur...,format
3,mysql,mysql is a free open source relational databas...,system
4,nant,nant is a build tool for .net aiming to suppor...,tool
...,...,...,...
41778,minimal-apis,Minimal APIs were introduced in ASP.NET Core 6...,
41779,ix.net,the interactive extensions (ix) is a .net libr...,library
41780,amazon-memory-db,memorydb for redis is an in-memory database se...,service
41781,decomposition,Decomposition might refer to Time Series Decom...,


In [4]:
# Map each tag name to its remaining wiki columns, shaped as
# {tag: {column: value}} so it can be handed to nx.set_node_attributes.
wiki_by_tag = df_wiki.set_index('TagName')
node_attr = wiki_by_tag.to_dict(orient='index')

In [5]:
# Undirected graph: one edge per row of df, carrying that row's weight.
Graphtype = nx.Graph()
G = nx.from_pandas_edgelist(
    df,
    source='source',
    target='target',
    edge_attr='weight',
    create_using=Graphtype,
)
# Attach the wiki columns to every node whose tag appears in node_attr.
nx.set_node_attributes(G, values=node_attr)

In [6]:
# Spot-check: the wiki attributes attached to the 'python' node.
G.nodes['python']

{'Body': 'python is a multi-paradigm dynamically typed multi-purpose programming language.',
 'root': 'language'}

In [7]:
# Neighbours of 'python' together with each connecting edge's attributes.
G['python']



In [8]:
# Partition the full tag graph into communities (seeded so reruns agree),
# then score that partition's modularity.
part_1 = community.best_partition(graph=G, random_state=27)
mod_1 = community.modularity(partition=part_1, graph=G)

In [9]:
# Community id that the partition assigned to 'python'.
part_1['python']

8

In [10]:
# Community ids start at 0, so the count is the largest id plus one.
number_of_comm_1 = max(part_1.values()) + 1
number_of_comm_1

20

In [11]:
# Group tags by community id: list_of_comm_1[i] holds the tags of community i.
# A single pass over the partition fills every bucket, instead of rescanning
# all tags once per community; tag order inside each bucket is unchanged.
list_of_comm_1 = [[] for _ in range(number_of_comm_1)]
for tag, comm_id in part_1.items():
    list_of_comm_1[comm_id].append(tag)

# Members of the community that contains 'python'.
list_of_comm_1[part_1['python']]

['csv',
 'color-space',
 'dictionary',
 'python',
 'cx-oracle',
 'random',
 'primes',
 'iteration',
 'django',
 'linear-algebra',
 'linear-equation',
 'itertools',
 'unicode',
 'text',
 'loops',
 'monkeypatching',
 'artificial-intelligence',
 'naivebayes',
 'parsing',
 'spss',
 'statistics',
 'html-parsing',
 'image-processing',
 'import',
 'list',
 'tuples',
 'ftplib',
 'gis',
 'html-content-extraction',
 'web-scraping',
 'tkinter',
 'ocr',
 'signal-processing',
 'text-files',
 'dct',
 'matlab',
 'utf-8',
 'graphing',
 'urllib',
 'if-statement',
 'plot',
 'zip',
 'encoding',
 'software-distribution',
 'inequality',
 'pylons',
 'python-2.x',
 'data-manipulation',
 'erp',
 'text-parsing',
 'set',
 'interpolation',
 'indentation',
 'module',
 'lxml',
 'doctest',
 'estimation',
 'human-readable',
 'robotics',
 'evolutionary-algorithm',
 'genetic-algorithm',
 'genetic-programming',
 'opencv',
 'rgb',
 'visualization',
 'ctypes',
 'python-sip',
 'encode',
 'yield',
 'packaging',
 'backport'

In [12]:
# Induced subgraph of G on the tags that share python's community.
python_comm_id = part_1['python']
G_1 = G.subgraph(list_of_comm_1[python_comm_id])

In [13]:
# Edges of the subgraph built from python's community.
G_1.edges

EdgeView([('bioinformatics', 'dna-sequence'), ('bioinformatics', 'python'), ('bioinformatics', 'fasta'), ('bioinformatics', 'cluster-analysis'), ('bioinformatics', 'biopython'), ('bioinformatics', 'blast'), ('bioinformatics', 'bioconductor'), ('bioinformatics', 'r'), ('bioinformatics', 'statistics'), ('bioinformatics', 'multiprocessing'), ('bioinformatics', 'elementtree'), ('bioinformatics', 'ncbi'), ('bioinformatics', 'iteration'), ('bioinformatics', 'visualization'), ('bioinformatics', 'xgrid'), ('bioinformatics', 'matlab'), ('bioinformatics', 'plot'), ('bioinformatics', 'matrix'), ('bioinformatics', 'scientific-computing'), ('bioinformatics', 'image-processing'), ('bioinformatics', 'mean'), ('bioinformatics', 'information-retrieval'), ('bioinformatics', 'signal-processing'), ('bioinformatics', 'csv'), ('bioinformatics', 'hierarchical-clustering'), ('bioinformatics', 'ape-phylo'), ('bioinformatics', 'probability'), ('bioinformatics', 'anova'), ('bioinformatics', 'mechanize'), ('bioin

In [14]:
# Second-level partition inside python's community, followed by its
# modularity. Seeded with the same random_state as the top-level partition;
# without a seed, the sub-communities can differ from one run to the next.
part_2 = community.best_partition(G_1, random_state=27)
mod_2 = community.modularity(part_2, G_1)

In [15]:
# Rank the tags of python's community by degree inside that subgraph,
# highest first; each item is a (tag, degree) pair.
sorted(G_1.degree, key=operator.itemgetter(1), reverse=True)

[('python', 1451),
 ('r', 534),
 ('django', 503),
 ('matlab', 418),
 ('parsing', 256),
 ('google-app-engine', 256),
 ('image-processing', 256),
 ('list', 255),
 ('python-3.x', 240),
 ('numpy', 235),
 ('unicode', 231),
 ('statistics', 224),
 ('matrix', 200),
 ('text', 196),
 ('machine-learning', 181),
 ('dictionary', 181),
 ('loops', 177),
 ('csv', 171),
 ('opencv', 171),
 ('plot', 170),
 ('artificial-intelligence', 163),
 ('encoding', 160),
 ('nlp', 154),
 ('random', 152),
 ('import', 152),
 ('module', 147),
 ('utf-8', 140),
 ('scipy', 140),
 ('wolfram-mathematica', 138),
 ('django-models', 133),
 ('matplotlib', 133),
 ('filter', 132),
 ('character-encoding', 132),
 ('computer-vision', 119),
 ('wxpython', 114),
 ('latex', 114),
 ('for-loop', 109),
 ('prolog', 104),
 ('data-mining', 104),
 ('neural-network', 102),
 ('python-imaging-library', 102),
 ('sqlalchemy', 100),
 ('django-admin', 98),
 ('twitter', 98),
 ('cluster-analysis', 97),
 ('pyqt', 95),
 ('generator', 95),
 ('set', 93),
 (

In [16]:
# Read the ", "-separated whitelist of root categories. The context manager
# closes the file handle, which a bare open() leaves open.
with open("../filter/final_categories.txt", "r") as f:
    categories = f.read().split(", ")
categories

['component',
 'database',
 'extension',
 'environment',
 'language',
 'library',
 'platform',
 'subsystem',
 'framework',
 'ide',
 'tool',
 'toolkit',
 'utility',
 'system']

In [17]:
# Tags of python's community, highest degree first, restricted to tags whose
# wiki 'root' is one of the whitelisted categories.
# Printed row format: tag,degree,root,body
for a, b in sorted(G_1.degree, key=lambda x: x[1], reverse=True):
    attrs = G.nodes[a]
    # A tag with no row in the wiki table has no attributes on its node, so
    # indexing ['root'] directly would raise KeyError; skip such tags.
    if 'root' not in attrs:
        continue
    root = attrs['root']
    # A missing category is read as NaN, whose str() is "nan".
    if str(root) != "nan" and str(root) in categories:
        print(a + "," + str(b) + "," + root + "," + node_attr[a]["Body"])

python,1451,language,python is a multi-paradigm dynamically typed multi-purpose programming language.
r,534,environment,r is a free open-source programming language & software environment for statistical computing bioinformatics visualization & general computing.
django,503,framework,django is an open-source server-side web application framework written in python.
matlab,418,language,matlab is a high-level language and interactive programming environment for numerical computation and visualization developed by mathworks.
numpy,235,extension,numpy is an extension of the python language that adds support to large multidimensional arrays and matrixes along with a large library of high-level mathematical functions for operations with these arrays.
csv,171,database,comma-separated values or character-separated values (csv) is a standard "flat file database" (or spreadsheet-style) format for storing tabular data in plain text with fields separated by a special character (comma tab etc).
open